Hello world

This commit is contained in:
Frederik Hanghøj Iversen 2020-11-13 15:49:01 +01:00
commit 7f0b3a166a
4 changed files with 215 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
*.hi
*.o
*.bin
Main

160
Bloom.hs Normal file
View File

@ -0,0 +1,160 @@
{-# language RoleAnnotations #-}
{-# language GADTs #-}
{-# language DataKinds #-}
{-# language InstanceSigs #-}
{-# language ScopedTypeVariables #-}
{-# language StandaloneDeriving #-}
{-# language DerivingStrategies #-}
{-# language TypeApplications #-}
{-# language TypeFamilies #-}
{-# options_ghc -Wall -Werror #-}
module Bloom (singleton, insert, Bloom, elem, assocs, hexdump) where
import Prelude hiding (elem)
import Data.Singletons
import Data.Kind (Type)
import Data.Array.Unboxed (UArray, listArray, (!))
import qualified Data.Array.Unboxed as Array
import GHC.Types (Nat)
import GHC.TypeLits (KnownNat, natVal)
import Data.Hashable (Hashable)
import qualified Data.Hashable as Hashable
import GHC.Exts (IsList(..))
import Data.List (sort)
import Data.Binary
import Control.Monad (when)
import Data.Bits
import Data.Foldable (foldl')
import Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy as ByteString
type role Bloom nominal nominal
data Bloom :: Nat -> Type -> Type where
Bloom :: forall a n . Sing n -> UArray Int Bool -> Bloom n a
deriving stock instance Show (Bloom n a)
instance Eq (Bloom n a) where
Bloom _ a == Bloom _ b = a == b
instance KnownNat n => Semigroup (Bloom n a) where
(<>) :: Bloom n a -> Bloom n a -> Bloom n a
Bloom _ a <> Bloom _ b = Bloom Sing c
where
c :: UArray Int Bool
c = listArray (0, pred n) $ f <$> [0..]
n :: Int
n = fromIntegral $ natVal @n Proxy
f :: Int -> Bool
f i = a ! i || b ! i
instance KnownNat n => Monoid (Bloom n a) where
mempty :: Bloom n a
mempty = Bloom Sing c
where
c :: UArray Int Bool
c = listArray (0, pred n) $ repeat False
n :: Int
n = fromIntegral $ natVal @n Proxy
instance KnownNat n => Binary (Bloom n a) where
-- This is not really a valid instance, because we don't know what
-- `n` and `a` was used to encode what we're `get`ing.
get = do
n <- get @Integer
let n' = natVal @n Proxy
when (n /= n') $ fail $ "Bloom.get: Wrong bucket size. Expected " <> show n' <> ". Got: " <> show n <> "."
Bloom Sing . listArray (0, fromInteger (pred n)) . unpack <$> get @ByteString
put b = do
put $ natVal @n Proxy
put @ByteString $ pack $ snd <$> assocs b
singleton :: KnownNat n => Hashable a => a -> Bloom n a
singleton a = insert a mempty
insert :: forall n a . KnownNat n => Hashable a => a -> Bloom n a -> Bloom n a
insert a (Bloom _ b) = Bloom Sing c
where
c :: UArray Int Bool
c = listArray (0, pred n) $ f <$> [0..n]
n :: Int
n = fromIntegral $ natVal @n Proxy
h :: Int
h = hash n a
f :: Int -> Bool
f i = b ! i || h == i
hash :: Hashable a => Int -> a -> Int
hash n = (`mod` n) . Hashable.hash
instance (KnownNat n, Hashable a) => IsList (Bloom n a) where
type Item (Bloom n a) = a
fromList :: [] a -> Bloom n a
fromList = Bloom Sing . indicesToArray (pred n) . fmap (hash n)
where
n :: Int
n = fromIntegral $ natVal @n Proxy
toList = error "Unimplemented"
indicesToArray :: Int -> [] Int -> UArray Int Bool
indicesToArray n = listArray (0, n) . indicesToList
indicesToList :: [] Int -> [] Bool
indicesToList xs0 = take (succ $ maximum xs0) $ sort xs0 <| [0..]
where
(<|) :: [] Int -> [] Int -> [] Bool
[] <| _ = repeat False
xs@(x:xss) <| (y:yss) = if x <= y then True : (xss <| yss) else False : (xs<|yss)
_ <| _ = error "IMPOSSIBLE"
elem :: forall a n . KnownNat n => Hashable a => a -> Bloom n a -> Bool
elem a (Bloom _ r) = r ! hash n a
where
n = fromIntegral $ natVal @n Proxy
assocs :: Bloom n a -> [(Int, Bool)]
assocs (Bloom _ b) = Array.assocs b
pack :: [] Bool -> ByteString
pack = ByteString.pack . toWords
unpack :: ByteString -> [] Bool
unpack = (>>= fromWord) . ByteString.unpack
toWords :: [] Bool -> [] Word8
toWords = fmap toWord . chunksOf 8
chunksOf :: Int -> [] a -> [[a]]
chunksOf n xs = a : case b of [] -> [] ; _ -> chunksOf n b
where
a = take n xs
b = drop n xs
toWord :: Bits b => [] Bool -> b
toWord xs = foldl' (.|.) zeroBits $ zipWith shift (fmap conv xs) [0..]
where
conv b = if b then bit 0 else zeroBits
fromWord :: Word8 -> [] Bool
fromWord w = testBit w <$> [0..7]
hexdump :: Bloom n a -> String
hexdump = hexdump' . toWords . fmap snd . assocs
hexdump' :: [] Word8 -> String
hexdump' = unlines . zipWith (\n s -> step16 n <> s) [0..] . ls
where
ls :: [Word8] -> [String]
ls = fmap unwords . chunksOf 16 . fmap step8
step16 :: Word16 -> [] Char
step16 w0 = [r ! fromIntegral d0, r ! fromIntegral d1, r ! fromIntegral d2, r ! fromIntegral d3]
where
(d0, w1) = w0 `divMod` 0x10
(d1, w2) = w1 `divMod` 0x10
(d2, w3) = w2 `divMod` 0x10
(d3, _ ) = w3 `divMod` 0x10
step8 :: Word8 -> [] Char
step8 w = [r ! a, r ! b]
where
(a, b) = w `divMod` 0x10
r :: UArray Word8 Char
r = listArray (0, 15) (['0'..'9'] <> ['a'..'f'])

48
Main.hs Normal file
View File

@ -0,0 +1,48 @@
{-# language TypeApplications #-}
{-# language DataKinds #-}
{-# options_ghc -Wall -Werror #-}
module Main (main) where
import GHC.Exts (IsList(..))
import Data.Foldable (traverse_)
import System.Exit (ExitCode(ExitSuccess, ExitFailure), exitWith)
import Bloom (Bloom)
import qualified Bloom
import qualified Data.Binary as Binary
import qualified Data.ByteString.Lazy as ByteString
import System.Directory
import System.FilePath
import System.Environment
main :: IO ()
main = do
b <- cachedLoad
input <- lines <$> getContents
let ws' = filter (not . (`Bloom.elem` b)) input
traverse_ putStrLn ws'
exitWith (if null ws' then ExitSuccess else ExitFailure 1)
cachedLoad :: IO (Bloom Size String)
cachedLoad = do
cache <- getCachePath
cacheExists <- doesFileExist cache
if cacheExists
then load cache
else do
b <- build
ByteString.writeFile cache $ Binary.encode b
pure b
getCachePath :: IO FilePath
getCachePath = (</> "bloom-spell/words.bin") <$> getEnv "XDG_DATA_HOME"
load :: FilePath -> IO (Bloom Size String)
load p = Binary.decode @(Bloom Size _) <$> ByteString.readFile p
build :: IO (Bloom Size String)
build = fromList @(Bloom Size _) . lines <$> readFile dict
type Size = 0xfffff
dict :: FilePath
dict = "/usr/share/dict/words"

3
README Normal file
View File

@ -0,0 +1,3 @@
A spell checker implemented using a bloom filter.
My first time programming with (emulated) dependent types in Haskell.