diff --git a/Cabal/Cabal.cabal b/Cabal/Cabal.cabal index 2a0f4aef37c..2bc052c6cfb 100644 --- a/Cabal/Cabal.cabal +++ b/Cabal/Cabal.cabal @@ -410,6 +410,7 @@ library Distribution.Types.ComponentRequestedSpec Distribution.Types.TargetInfo Distribution.Utils.NubList + Distribution.Utils.ShortText Distribution.Verbosity Distribution.Version Language.Haskell.Extension @@ -422,6 +423,7 @@ library Distribution.Compat.Prelude Distribution.GetOpt Distribution.Lex + Distribution.Utils.String Distribution.Simple.GHC.Internal Distribution.Simple.GHC.IPI642 Distribution.Simple.GHC.IPIConvert @@ -479,6 +481,7 @@ test-suite unit-tests UnitTests.Distribution.Simple.Utils UnitTests.Distribution.System UnitTests.Distribution.Utils.NubList + UnitTests.Distribution.Utils.ShortText UnitTests.Distribution.Version main-is: UnitTests.hs build-depends: diff --git a/Cabal/Distribution/Package.hs b/Cabal/Distribution/Package.hs index 33cb6944f3b..4cf6838436d 100644 --- a/Cabal/Distribution/Package.hs +++ b/Cabal/Distribution/Package.hs @@ -52,6 +52,7 @@ module Distribution.Package ( import Prelude () import Distribution.Compat.Prelude +import Distribution.Utils.ShortText import Distribution.Version ( Version, VersionRange, anyVersion, thisVersion @@ -74,12 +75,12 @@ import Text.PrettyPrint ((<+>), text) -- This type is opaque since @Cabal-2.0@ -- -- @since 2.0 -newtype PackageName = PackageName String +newtype PackageName = PackageName ShortText deriving (Generic, Read, Show, Eq, Ord, Typeable, Data) -- | Convert 'PackageName' to 'String' unPackageName :: PackageName -> String -unPackageName (PackageName s) = s +unPackageName (PackageName s) = fromShortText s -- | Construct a 'PackageName' from a 'String' -- @@ -90,7 +91,7 @@ unPackageName (PackageName s) = s -- -- @since 2.0 mkPackageName :: String -> PackageName -mkPackageName = PackageName +mkPackageName = PackageName . toShortText instance Binary PackageName diff --git a/Cabal/Distribution/Simple/Utils.hs b/Cabal/Distribution/Simple/Utils.hs index dc62969ca48..060a624d3c5 100644 --- a/Cabal/Distribution/Simple/Utils.hs +++ b/Cabal/Distribution/Simple/Utils.hs @@ -152,18 +152,13 @@ module Distribution.Simple.Utils ( -- * FilePath stuff isAbsoluteOnAnyPlatform, isRelativeOnAnyPlatform, - - -- * 'ShortText' type - ShortText, - toShortText, - fromShortText, ) where import Prelude () import Distribution.Compat.Prelude -import Data.String (IsString(..)) import Distribution.Text +import Distribution.Utils.String import Distribution.Package import Distribution.ModuleName as ModuleName import Distribution.System @@ -188,7 +183,6 @@ import Distribution.Verbosity import qualified Paths_Cabal (version) #endif -import Data.Word (Word8) import Control.Concurrent.MVar ( newEmptyMVar, putMVar, takeMVar ) import Data.Bits @@ -205,16 +199,6 @@ import qualified Data.Set as Set import qualified Data.ByteString as SBS -#if defined(MIN_VERSION_bytestring) -# if MIN_VERSION_bytestring(0,10,4) -# define HAVE_SHORTBYTESTRING 1 -# endif -#endif - -#if HAVE_SHORTBYTESTRING -import qualified Data.ByteString.Short as BS.Short -#endif - import System.Directory ( Permissions(executable), getDirectoryContents, getPermissions , doesDirectoryExist, doesFileExist, removeFile, findExecutable @@ -1390,55 +1374,10 @@ fromUTF8 (c:cs) replacementChar = '\xfffd' fromUTF8BS :: SBS.ByteString -> String -fromUTF8BS = fromUTF8BSImpl . SBS.unpack +fromUTF8BS = decodeStringUtf8 . SBS.unpack fromUTF8LBS :: BS.ByteString -> String -fromUTF8LBS = fromUTF8BSImpl . BS.unpack - -fromUTF8BSImpl :: [Word8] -> String -fromUTF8BSImpl = go - where - go :: [Word8] -> String - go [] = [] - go (c : cs) - | c <= 0x7F = chr (fromIntegral c) : go cs - | c <= 0xBF = replacementChar : go cs - | c <= 0xDF = twoBytes c cs - | c <= 0xEF = moreBytes 3 0x800 cs (fromIntegral $ c .&. 0xF) - | c <= 0xF7 = moreBytes 4 0x10000 cs (fromIntegral $ c .&. 0x7) - | c <= 0xFB = moreBytes 5 0x200000 cs (fromIntegral $ c .&. 0x3) - | c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1) - | otherwise = replacementChar : go cs - - twoBytes :: Word8 -> [Word8] -> String - twoBytes c0 (c1:cs') - | c1 .&. 0xC0 == 0x80 - = let d = ((c0 .&. 0x1F) `shiftL` 6) - .|. (c1 .&. 0x3F) - in if d >= 0x80 - then chr (fromIntegral d) : go cs' - else replacementChar : go cs' - twoBytes _ cs' = replacementChar : go cs' - - moreBytes :: Int -> Int -> [Word8] -> Int -> [Char] - moreBytes 1 overlong cs' acc - | overlong <= acc && acc <= 0x10FFFF - && (acc < 0xD800 || 0xDFFF < acc) - && (acc < 0xFFFE || 0xFFFF < acc) - = chr acc : go cs' - - | otherwise - = replacementChar : go cs' - - moreBytes byteCount overlong (cn:cs') acc - | cn .&. 0xC0 == 0x80 - = moreBytes (byteCount-1) overlong cs' - ((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F) - - moreBytes _ _ cs' _ - = replacementChar : go cs' - - replacementChar = '\xfffd' +fromUTF8LBS = decodeStringUtf8 . BS.unpack toUTF8 :: String -> String toUTF8 [] = [] @@ -1459,26 +1398,6 @@ toUTF8 (c:cs) : toUTF8 cs where w = ord c --- | Variant of 'toUTF8' operating on 'Word8's directly -toUTF8BSImpl :: String -> [Word8] -toUTF8BSImpl [] = [] -toUTF8BSImpl (c:cs) - | c <= '\x07F' = w - : toUTF8BSImpl cs - | c <= '\x7FF' = (0xC0 .|. (w `shiftR` 6)) - : (0x80 .|. (w .&. 0x3F)) - : toUTF8BSImpl cs - | c <= '\xFFFF'= (0xE0 .|. (w `shiftR` 12)) - : (0x80 .|. ((w `shiftR` 6) .&. 0x3F)) - : (0x80 .|. (w .&. 0x3F)) - : toUTF8BSImpl cs - | otherwise = (0xf0 .|. (w `shiftR` 18)) - : (0x80 .|. ((w `shiftR` 12) .&. 0x3F)) - : (0x80 .|. ((w `shiftR` 6) .&. 0x3F)) - : (0x80 .|. (w .&. 0x3F)) - : toUTF8BSImpl cs - where w = fromIntegral (ord c) :: Word8 - -- | Whether BOM is at the beginning of the input startsWithBOM :: String -> Bool startsWithBOM ('\xFEFF':_) = True @@ -1519,7 +1438,7 @@ withUTF8FileContents name action = -- Uses 'writeFileAtomic', so provides the same guarantees. -- writeUTF8File :: FilePath -> String -> NoCallStackIO () -writeUTF8File path = writeFileAtomic path . BS.Char8.pack . toUTF8 +writeUTF8File path = writeFileAtomic path . BS.pack . encodeStringUtf8 -- | Fix different systems silly line ending conventions normaliseLineEndings :: String -> String @@ -1664,78 +1583,3 @@ isAbsoluteOnAnyPlatform _ = False -- | @isRelativeOnAnyPlatform = not . 'isAbsoluteOnAnyPlatform'@ isRelativeOnAnyPlatform :: FilePath -> Bool isRelativeOnAnyPlatform = not . isAbsoluteOnAnyPlatform - --- ------------------------------------------------------------ --- * 'ShortText' type --- ------------------------------------------------------------ - --- TODO: if we start using this internally for more opaque types in --- Cabal then we will likely need to promote it to it's own module in --- Distribution.* to avoid cycles, or just to maintain the sanity of --- the Distribution.* vs Distribution.Simple.* distinction. - --- | Construct 'ShortText' from 'String' -toShortText :: String -> ShortText - --- | Convert 'ShortText' to 'String' -fromShortText :: ShortText -> String - --- | Compact representation of short 'Strings' --- --- The data is stored internally as UTF8 in an --- 'BS.Short.ShortByteString' when compiled against @bytestring >= --- 0.10.4@, and otherwise the fallback is to use plain old non-compat --- '[Char]'. --- --- Note: This type is for internal uses (such as e.g. 'PackageName') --- and shall not be exposed in Cabal's API --- --- @since 2.0.0 -#if HAVE_SHORTBYTESTRING -newtype ShortText = ST { unST :: BS.Short.ShortByteString } - deriving (Eq,Ord,Generic) - -# if MIN_VERSION_binary(0,8,1) -instance Binary ShortText where - put = put . unST - get = fmap ST get -# else -instance Binary ShortText where - put = put . BS.Short.fromShort . unST - get = fmap (ST . BS.Short.toShort) get -# endif - -toShortText = ST . BS.Short.pack . toUTF8BSImpl - -fromShortText = fromUTF8BSImpl . BS.Short.unpack . unST -#else -newtype ShortText = ST { unST :: String } - deriving (Eq,Ord,Generic) - -instance Binary ShortText where - put = put . toUTF8BSImpl . unST - get = fmap (ST . fromUTF8BSImpl) get - -toShortText = ST - -fromShortText = unST -#endif - -instance NFData ShortText where - rnf = rnf . unST - -instance Show ShortText where - show = show . fromShortText - -instance Read ShortText where - readsPrec p = map (first toShortText) . readsPrec p - -instance Semigroup ShortText where - ST a <> ST b = ST (mappend a b) - -instance Monoid ShortText where - mempty = ST mempty - mappend = (<>) - -instance IsString ShortText where - fromString = toShortText diff --git a/Cabal/Distribution/Utils/ShortText.hs b/Cabal/Distribution/Utils/ShortText.hs new file mode 100644 index 00000000000..71f008808bf --- /dev/null +++ b/Cabal/Distribution/Utils/ShortText.hs @@ -0,0 +1,96 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE DeriveDataTypeable #-} +{-# LANGUAGE DeriveGeneric #-} + +module Distribution.Utils.ShortText + ( -- * 'ShortText' type + ShortText + , toShortText + , fromShortText + + -- * internal utilities + , decodeStringUtf8 + , encodeStringUtf8 + ) where + +import Prelude () +import Distribution.Compat.Prelude +import Distribution.Utils.String + +import Data.String (IsString(..)) + +#if defined(MIN_VERSION_bytestring) +# if MIN_VERSION_bytestring(0,10,4) +# define HAVE_SHORTBYTESTRING 1 +# endif +#endif + +#if HAVE_SHORTBYTESTRING +import qualified Data.ByteString.Short as BS.Short +#endif + +-- | Construct 'ShortText' from 'String' +toShortText :: String -> ShortText + +-- | Convert 'ShortText' to 'String' +fromShortText :: ShortText -> String + +-- | Compact representation of short 'Strings' +-- +-- The data is stored internally as UTF8 in an +-- 'BS.Short.ShortByteString' when compiled against @bytestring >= +-- 0.10.4@, and otherwise the fallback is to use plain old non-compat +-- '[Char]'. +-- +-- Note: This type is for internal uses (such as e.g. 'PackageName') +-- and shall not be exposed in Cabal's API +-- +-- @since 2.0.0 +#if HAVE_SHORTBYTESTRING +newtype ShortText = ST { unST :: BS.Short.ShortByteString } + deriving (Eq,Ord,Generic,Data,Typeable) + +# if MIN_VERSION_binary(0,8,1) +instance Binary ShortText where + put = put . unST + get = fmap ST get +# else +instance Binary ShortText where + put = put . BS.Short.fromShort . unST + get = fmap (ST . BS.Short.toShort) get +# endif + +toShortText = ST . BS.Short.pack . encodeStringUtf8 + +fromShortText = decodeStringUtf8 . BS.Short.unpack . unST +#else +newtype ShortText = ST { unST :: String } + deriving (Eq,Ord,Generic,Data,Typeable) + +instance Binary ShortText where + put = put . encodeStringUtf8 . unST + get = fmap (ST . decodeStringUtf8) get + +toShortText = ST + +fromShortText = unST +#endif + +instance NFData ShortText where + rnf = rnf . unST + +instance Show ShortText where + show = show . fromShortText + +instance Read ShortText where + readsPrec p = map (first toShortText) . readsPrec p + +instance Semigroup ShortText where + ST a <> ST b = ST (mappend a b) + +instance Monoid ShortText where + mempty = ST mempty + mappend = (<>) + +instance IsString ShortText where + fromString = toShortText diff --git a/Cabal/Distribution/Utils/String.hs b/Cabal/Distribution/Utils/String.hs new file mode 100644 index 00000000000..7db2a2c2c4c --- /dev/null +++ b/Cabal/Distribution/Utils/String.hs @@ -0,0 +1,82 @@ +module Distribution.Utils.String + ( -- * Encode to/from UTF8 + decodeStringUtf8 + , encodeStringUtf8 + ) where + +import Data.Word +import Data.Bits +import Data.Char (chr,ord) + +-- | Decode 'String' from UTF8-encoded octets. +-- +-- Invalid data will be decoded as the replacement character (@U+FFFD@) +-- +-- See also 'encodeStringUtf8' +decodeStringUtf8 :: [Word8] -> String +decodeStringUtf8 = go + where + go :: [Word8] -> String + go [] = [] + go (c : cs) + | c <= 0x7F = chr (fromIntegral c) : go cs + | c <= 0xBF = replacementChar : go cs + | c <= 0xDF = twoBytes c cs + | c <= 0xEF = moreBytes 3 0x800 cs (fromIntegral $ c .&. 0xF) + | c <= 0xF7 = moreBytes 4 0x10000 cs (fromIntegral $ c .&. 0x7) + | c <= 0xFB = moreBytes 5 0x200000 cs (fromIntegral $ c .&. 0x3) + | c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1) + | otherwise = replacementChar : go cs + + twoBytes :: Word8 -> [Word8] -> String + twoBytes c0 (c1:cs') + | c1 .&. 0xC0 == 0x80 + = let d = ((c0 .&. 0x1F) `shiftL` 6) + .|. (c1 .&. 0x3F) + in if d >= 0x80 + then chr (fromIntegral d) : go cs' + else replacementChar : go cs' + twoBytes _ cs' = replacementChar : go cs' + + moreBytes :: Int -> Int -> [Word8] -> Int -> [Char] + moreBytes 1 overlong cs' acc + | overlong <= acc && acc <= 0x10FFFF + && (acc < 0xD800 || 0xDFFF < acc) + && (acc < 0xFFFE || 0xFFFF < acc) + = chr acc : go cs' + + | otherwise + = replacementChar : go cs' + + moreBytes byteCount overlong (cn:cs') acc + | cn .&. 0xC0 == 0x80 + = moreBytes (byteCount-1) overlong cs' + ((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F) + + moreBytes _ _ cs' _ + = replacementChar : go cs' + + replacementChar = '\xfffd' + + +-- | Encode 'String' to a list of UTF8-encoded octets +-- +-- See also 'decodeUtf8' +encodeStringUtf8 :: String -> [Word8] +encodeStringUtf8 [] = [] +encodeStringUtf8 (c:cs) + | c <= '\x07F' = w + : encodeStringUtf8 cs + | c <= '\x7FF' = (0xC0 .|. (w `shiftR` 6)) + : (0x80 .|. (w .&. 0x3F)) + : encodeStringUtf8 cs + | c <= '\xFFFF'= (0xE0 .|. (w `shiftR` 12)) + : (0x80 .|. ((w `shiftR` 6) .&. 0x3F)) + : (0x80 .|. (w .&. 0x3F)) + : encodeStringUtf8 cs + | otherwise = (0xf0 .|. (w `shiftR` 18)) + : (0x80 .|. ((w `shiftR` 12) .&. 0x3F)) + : (0x80 .|. ((w `shiftR` 6) .&. 0x3F)) + : (0x80 .|. (w .&. 0x3F)) + : encodeStringUtf8 cs + where w = fromIntegral (ord c) :: Word8 diff --git a/Cabal/changelog b/Cabal/changelog index 9761ac8c638..8ef7b6e57d4 100644 --- a/Cabal/changelog +++ b/Cabal/changelog @@ -90,7 +90,7 @@ call site/stack of a logging output respectively (these are only supported if Cabal is built with GHC 8.0/7.10.2 or greater, respectively). - * New `Distribution.Simple.Utils.ShortText` type for representing + * New `Distribution.Utils.ShortText.ShortText` type for representing short text strings compactly (#3898) 1.24.0.0 Ryan Thomas March 2016 diff --git a/Cabal/tests/UnitTests.hs b/Cabal/tests/UnitTests.hs index 3ffaba520ec..08cba874ec4 100644 --- a/Cabal/tests/UnitTests.hs +++ b/Cabal/tests/UnitTests.hs @@ -21,6 +21,7 @@ import qualified UnitTests.Distribution.Simple.Program.Internal import qualified UnitTests.Distribution.Simple.Utils import qualified UnitTests.Distribution.System import qualified UnitTests.Distribution.Utils.NubList +import qualified UnitTests.Distribution.Utils.ShortText import qualified UnitTests.Distribution.Version (versionTests) tests :: Int -> TestTree @@ -45,6 +46,8 @@ tests mtimeChangeCalibrated = UnitTests.Distribution.Simple.Utils.tests , testGroup "Distribution.Utils.NubList" UnitTests.Distribution.Utils.NubList.tests + , testGroup "Distribution.Utils.ShortText" + UnitTests.Distribution.Utils.ShortText.tests , testGroup "Distribution.System" UnitTests.Distribution.System.tests , testGroup "Distribution.Version" diff --git a/Cabal/tests/UnitTests/Distribution/Simple/Utils.hs b/Cabal/tests/UnitTests/Distribution/Simple/Utils.hs index 63b064f7b1a..59a9b4836de 100644 --- a/Cabal/tests/UnitTests/Distribution/Simple/Utils.hs +++ b/Cabal/tests/UnitTests/Distribution/Simple/Utils.hs @@ -5,7 +5,6 @@ module UnitTests.Distribution.Simple.Utils import Distribution.Simple.Utils import Distribution.Verbosity -import Data.Monoid as Mon import Data.IORef import System.Directory ( doesDirectoryExist, doesFileExist , getTemporaryDirectory @@ -16,9 +15,6 @@ import qualified Control.Exception as Exception import Test.Tasty import Test.Tasty.HUnit -import Test.Tasty.QuickCheck - -import Distribution.Compat.Binary (encode, decode) withTempFileTest :: Assertion withTempFileTest = do @@ -89,20 +85,6 @@ rawSystemStdInOutTextDecodingTest -prop_ShortTextOrd :: String -> String -> Bool -prop_ShortTextOrd a b = compare a b == compare (toShortText a) (toShortText b) - -prop_ShortTextMonoid :: String -> String -> Bool -prop_ShortTextMonoid a b = Mon.mappend a b == fromShortText (mappend (toShortText a) (toShortText b)) - -prop_ShortTextId :: String -> Bool -prop_ShortTextId a = (fromShortText . toShortText) a == a - -prop_ShortTextBinaryId :: String -> Bool -prop_ShortTextBinaryId a = (decode . encode) a' == a' - where - a' = toShortText a - tests :: [TestTree] tests = [ testCase "withTempFile works as expected" $ @@ -115,9 +97,4 @@ tests = withTempDirRemovedTest , testCase "rawSystemStdInOut reports text decoding errors" $ rawSystemStdInOutTextDecodingTest - - , testProperty "ShortText Id" prop_ShortTextId - , testProperty "ShortText Ord" prop_ShortTextOrd - , testProperty "ShortText Monoid" prop_ShortTextMonoid - , testProperty "ShortText BinaryId" prop_ShortTextBinaryId ] diff --git a/Cabal/tests/UnitTests/Distribution/Utils/ShortText.hs b/Cabal/tests/UnitTests/Distribution/Utils/ShortText.hs new file mode 100644 index 00000000000..73298f361de --- /dev/null +++ b/Cabal/tests/UnitTests/Distribution/Utils/ShortText.hs @@ -0,0 +1,33 @@ +module UnitTests.Distribution.Utils.ShortText + ( tests + ) where + +import Data.Monoid as Mon +import Test.Tasty +import Test.Tasty.QuickCheck + +import Distribution.Compat.Binary (encode, decode) + +import Distribution.Utils.ShortText + +prop_ShortTextOrd :: String -> String -> Bool +prop_ShortTextOrd a b = compare a b == compare (toShortText a) (toShortText b) + +prop_ShortTextMonoid :: String -> String -> Bool +prop_ShortTextMonoid a b = Mon.mappend a b == fromShortText (mappend (toShortText a) (toShortText b)) + +prop_ShortTextId :: String -> Bool +prop_ShortTextId a = (fromShortText . toShortText) a == a + +prop_ShortTextBinaryId :: String -> Bool +prop_ShortTextBinaryId a = (decode . encode) a' == a' + where + a' = toShortText a + +tests :: [TestTree] +tests = + [ testProperty "ShortText Id" prop_ShortTextId + , testProperty "ShortText Ord" prop_ShortTextOrd + , testProperty "ShortText Monoid" prop_ShortTextMonoid + , testProperty "ShortText BinaryId" prop_ShortTextBinaryId + ]