module Distribution.Utils.String
    ( -- * Encode to/from UTF8
      decodeStringUtf8
    , encodeStringUtf8
    ) where

import Data.Word
import Data.Bits
import Data.Char (chr,ord)

-- | Decode 'String' from UTF8-encoded octets.
--
-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
--
-- See also 'encodeStringUtf8'
decodeStringUtf8 :: [Word8] -> String
decodeStringUtf8 :: [Word8] -> String
decodeStringUtf8 = [Word8] -> String
go
  where
    go :: [Word8] -> String
    go :: [Word8] -> String
go []       = []
    go (c :: Word8
c : cs :: [Word8]
cs)
      | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= 0x7F = Int -> Char
chr (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
c) Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
go [Word8]
cs
      | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= 0xBF = Char
replacementChar Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
go [Word8]
cs
      | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= 0xDF = Word8 -> [Word8] -> String
twoBytes Word8
c [Word8]
cs
      | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= 0xEF = Int -> Int -> [Word8] -> Int -> String
moreBytes 3 0x800     [Word8]
cs (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8 -> Int) -> Word8 -> Int
forall a b. (a -> b) -> a -> b
$ Word8
c Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0xF)
      | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= 0xF7 = Int -> Int -> [Word8] -> Int -> String
moreBytes 4 0x10000   [Word8]
cs (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8 -> Int) -> Word8 -> Int
forall a b. (a -> b) -> a -> b
$ Word8
c Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x7)
      | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= 0xFB = Int -> Int -> [Word8] -> Int -> String
moreBytes 5 0x200000  [Word8]
cs (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8 -> Int) -> Word8 -> Int
forall a b. (a -> b) -> a -> b
$ Word8
c Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x3)
      | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= 0xFD = Int -> Int -> [Word8] -> Int -> String
moreBytes 6 0x4000000 [Word8]
cs (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8 -> Int) -> Word8 -> Int
forall a b. (a -> b) -> a -> b
$ Word8
c Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x1)
      | Bool
otherwise   = Char
replacementChar Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
go [Word8]
cs

    twoBytes :: Word8 -> [Word8] -> String
    twoBytes :: Word8 -> [Word8] -> String
twoBytes c0 :: Word8
c0 (c1 :: Word8
c1:cs' :: [Word8]
cs')
      | Word8
c1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0xC0 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== 0x80
      = let d :: Int
d = (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
c0 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x1F) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` 6)
             Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
c1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x3F)
         in if Int
d Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= 0x80
               then  Int -> Char
chr Int
d                Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
go [Word8]
cs'
               else  Char
replacementChar      Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
go [Word8]
cs'
    twoBytes _ cs' :: [Word8]
cs' = Char
replacementChar      Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
go [Word8]
cs'

    moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
    moreBytes :: Int -> Int -> [Word8] -> Int -> String
moreBytes 1 overlong :: Int
overlong cs' :: [Word8]
cs' acc :: Int
acc
      | Int
overlong Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
acc, Int
acc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= 0x10FFFF, Int
acc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< 0xD800 Bool -> Bool -> Bool
|| 0xDFFF Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
acc
      = Int -> Char
chr Int
acc Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
go [Word8]
cs'

      | Bool
otherwise
      = Char
replacementChar Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
go [Word8]
cs'

    moreBytes byteCount :: Int
byteCount overlong :: Int
overlong (cn :: Word8
cn:cs' :: [Word8]
cs') acc :: Int
acc
      | Word8
cn Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0xC0 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== 0x80
      = Int -> Int -> [Word8] -> Int -> String
moreBytes (Int
byteCountInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) Int
overlong [Word8]
cs'
          ((Int
acc Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` 6) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
cn Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. 0x3F)

    moreBytes _ _ cs' :: [Word8]
cs' _
      = Char
replacementChar Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
go [Word8]
cs'

    replacementChar :: Char
replacementChar = '\xfffd'


-- | Encode 'String' to a list of UTF8-encoded octets
--
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
-- as the replacement character (i.e. @U+FFFD@).
--
-- See also 'decodeUtf8'
encodeStringUtf8 :: String -> [Word8]
encodeStringUtf8 :: String -> [Word8]
encodeStringUtf8 []        = []
encodeStringUtf8 (c :: Char
c:cs :: String
cs)
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x07F' = Word8
w8
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x7FF' = (0xC0 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.  Int -> Word8
w8ShiftR  6          )
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Word8
w8          Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xD7FF'= (0xE0 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.  Int -> Word8
w8ShiftR 12          )
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Int -> Word8
w8ShiftR  6 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Word8
w8          Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xDFFF'= 0xEF Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: 0xBF Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: 0xBD -- U+FFFD
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xFFFF'= (0xE0 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.  Int -> Word8
w8ShiftR 12          )
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Int -> Word8
w8ShiftR  6 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Word8
w8          Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  | Bool
otherwise    = (0xf0 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.  Int -> Word8
w8ShiftR 18          )
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Int -> Word8
w8ShiftR 12 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Int -> Word8
w8ShiftR  6 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Word8
w8          Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. 0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  where
    w8 :: Word8
w8 = Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
ord Char
c) :: Word8
    w8ShiftR :: Int -> Word8
    w8ShiftR :: Int -> Word8
w8ShiftR = Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> (Int -> Int) -> Int -> Word8
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Int -> Int
forall a. Bits a => a -> Int -> a
shiftR (Char -> Int
ord Char
c)