commit 7f0b3a166a5865cfada2f03bfbc0c15079eef8ef Author: Frederik Hanghøj Iversen Date: Fri Nov 13 15:49:01 2020 +0100 Hello world diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5be0d00 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.hi +*.o +*.bin +Main \ No newline at end of file diff --git a/Bloom.hs b/Bloom.hs new file mode 100644 index 0000000..4586aaa --- /dev/null +++ b/Bloom.hs @@ -0,0 +1,160 @@ +{-# language RoleAnnotations #-} +{-# language GADTs #-} +{-# language DataKinds #-} +{-# language InstanceSigs #-} +{-# language ScopedTypeVariables #-} +{-# language StandaloneDeriving #-} +{-# language DerivingStrategies #-} +{-# language TypeApplications #-} +{-# language TypeFamilies #-} +{-# options_ghc -Wall -Werror #-} +module Bloom (singleton, insert, Bloom, elem, assocs, hexdump) where + +import Prelude hiding (elem) +import Data.Singletons +import Data.Kind (Type) +import Data.Array.Unboxed (UArray, listArray, (!)) +import qualified Data.Array.Unboxed as Array +import GHC.Types (Nat) +import GHC.TypeLits (KnownNat, natVal) +import Data.Hashable (Hashable) +import qualified Data.Hashable as Hashable +import GHC.Exts (IsList(..)) +import Data.List (sort) +import Data.Binary +import Control.Monad (when) +import Data.Bits +import Data.Foldable (foldl') +import Data.ByteString.Lazy (ByteString) +import qualified Data.ByteString.Lazy as ByteString + +type role Bloom nominal nominal +data Bloom :: Nat -> Type -> Type where + Bloom :: forall a n . Sing n -> UArray Int Bool -> Bloom n a + +deriving stock instance Show (Bloom n a) +instance Eq (Bloom n a) where + Bloom _ a == Bloom _ b = a == b + +instance KnownNat n => Semigroup (Bloom n a) where + (<>) :: Bloom n a -> Bloom n a -> Bloom n a + Bloom _ a <> Bloom _ b = Bloom Sing c + where + c :: UArray Int Bool + c = listArray (0, pred n) $ f <$> [0..] + n :: Int + n = fromIntegral $ natVal @n Proxy + f :: Int -> Bool + f i = a ! i || b ! i + +instance KnownNat n => Monoid (Bloom n a) where + mempty :: Bloom n a + mempty = Bloom Sing c + where + c :: UArray Int Bool + c = listArray (0, pred n) $ repeat False + n :: Int + n = fromIntegral $ natVal @n Proxy + +instance KnownNat n => Binary (Bloom n a) where + -- This is not really a valid instance, because we don't know what + -- `n` and `a` was used to encode what we're `get`ing. + get = do + n <- get @Integer + let n' = natVal @n Proxy + when (n /= n') $ fail $ "Bloom.get: Wrong bucket size. Expected " <> show n' <> ". Got: " <> show n <> "." + Bloom Sing . listArray (0, fromInteger (pred n)) . unpack <$> get @ByteString + put b = do + put $ natVal @n Proxy + put @ByteString $ pack $ snd <$> assocs b + +singleton :: KnownNat n => Hashable a => a -> Bloom n a +singleton a = insert a mempty + +insert :: forall n a . KnownNat n => Hashable a => a -> Bloom n a -> Bloom n a +insert a (Bloom _ b) = Bloom Sing c + where + c :: UArray Int Bool + c = listArray (0, pred n) $ f <$> [0..n] + n :: Int + n = fromIntegral $ natVal @n Proxy + h :: Int + h = hash n a + f :: Int -> Bool + f i = b ! i || h == i + +hash :: Hashable a => Int -> a -> Int +hash n = (`mod` n) . Hashable.hash + +instance (KnownNat n, Hashable a) => IsList (Bloom n a) where + type Item (Bloom n a) = a + fromList :: [] a -> Bloom n a + fromList = Bloom Sing . indicesToArray (pred n) . fmap (hash n) + where + n :: Int + n = fromIntegral $ natVal @n Proxy + toList = error "Unimplemented" + +indicesToArray :: Int -> [] Int -> UArray Int Bool +indicesToArray n = listArray (0, n) . indicesToList + +indicesToList :: [] Int -> [] Bool +indicesToList xs0 = take (succ $ maximum xs0) $ sort xs0 <| [0..] + where + (<|) :: [] Int -> [] Int -> [] Bool + [] <| _ = repeat False + xs@(x:xss) <| (y:yss) = if x <= y then True : (xss <| yss) else False : (xs<|yss) + _ <| _ = error "IMPOSSIBLE" + +elem :: forall a n . KnownNat n => Hashable a => a -> Bloom n a -> Bool +elem a (Bloom _ r) = r ! hash n a + where + n = fromIntegral $ natVal @n Proxy + +assocs :: Bloom n a -> [(Int, Bool)] +assocs (Bloom _ b) = Array.assocs b + +pack :: [] Bool -> ByteString +pack = ByteString.pack . toWords + +unpack :: ByteString -> [] Bool +unpack = (>>= fromWord) . ByteString.unpack + +toWords :: [] Bool -> [] Word8 +toWords = fmap toWord . chunksOf 8 + +chunksOf :: Int -> [] a -> [[a]] +chunksOf n xs = a : case b of [] -> [] ; _ -> chunksOf n b + where + a = take n xs + b = drop n xs + +toWord :: Bits b => [] Bool -> b +toWord xs = foldl' (.|.) zeroBits $ zipWith shift (fmap conv xs) [0..] + where + conv b = if b then bit 0 else zeroBits + +fromWord :: Word8 -> [] Bool +fromWord w = testBit w <$> [0..7] + +hexdump :: Bloom n a -> String +hexdump = hexdump' . toWords . fmap snd . assocs + +hexdump' :: [] Word8 -> String +hexdump' = unlines . zipWith (\n s -> step16 n <> s) [0..] . ls + where + ls :: [Word8] -> [String] + ls = fmap unwords . chunksOf 16 . fmap step8 + step16 :: Word16 -> [] Char + step16 w0 = [r ! fromIntegral d0, r ! fromIntegral d1, r ! fromIntegral d2, r ! fromIntegral d3] + where + (d0, w1) = w0 `divMod` 0x10 + (d1, w2) = w1 `divMod` 0x10 + (d2, w3) = w2 `divMod` 0x10 + (d3, _ ) = w3 `divMod` 0x10 + step8 :: Word8 -> [] Char + step8 w = [r ! a, r ! b] + where + (a, b) = w `divMod` 0x10 + r :: UArray Word8 Char + r = listArray (0, 15) (['0'..'9'] <> ['a'..'f']) diff --git a/Main.hs b/Main.hs new file mode 100644 index 0000000..5b97373 --- /dev/null +++ b/Main.hs @@ -0,0 +1,48 @@ +{-# language TypeApplications #-} +{-# language DataKinds #-} +{-# options_ghc -Wall -Werror #-} +module Main (main) where + +import GHC.Exts (IsList(..)) +import Data.Foldable (traverse_) +import System.Exit (ExitCode(ExitSuccess, ExitFailure), exitWith) +import Bloom (Bloom) +import qualified Bloom +import qualified Data.Binary as Binary +import qualified Data.ByteString.Lazy as ByteString +import System.Directory +import System.FilePath +import System.Environment + +main :: IO () +main = do + b <- cachedLoad + input <- lines <$> getContents + let ws' = filter (not . (`Bloom.elem` b)) input + traverse_ putStrLn ws' + exitWith (if null ws' then ExitSuccess else ExitFailure 1) + +cachedLoad :: IO (Bloom Size String) +cachedLoad = do + cache <- getCachePath + cacheExists <- doesFileExist cache + if cacheExists + then load cache + else do + b <- build + ByteString.writeFile cache $ Binary.encode b + pure b + +getCachePath :: IO FilePath +getCachePath = ( "bloom-spell/words.bin") <$> getEnv "XDG_DATA_HOME" + +load :: FilePath -> IO (Bloom Size String) +load p = Binary.decode @(Bloom Size _) <$> ByteString.readFile p + +build :: IO (Bloom Size String) +build = fromList @(Bloom Size _) . lines <$> readFile dict + +type Size = 0xfffff + +dict :: FilePath +dict = "/usr/share/dict/words" diff --git a/README b/README new file mode 100644 index 0000000..d7cfb1f --- /dev/null +++ b/README @@ -0,0 +1,3 @@ +A spell checker implemented using a bloom filter. + +My first time programming with (emulated) dependent types in Haskell. \ No newline at end of file