-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathAnalyze.hs
116 lines (101 loc) · 4.16 KB
/
Analyze.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}
module Analyze where
import Control.Applicative ((<$>))
import Control.Monad ((<=<))
import Prelude hiding (filter, getLine, head, lookup,
null, putStrLn, words)
import Control.Error.Util (note)
import Data.HashMap.Strict (HashMap, lookup, (!))
import qualified Data.HashMap.Strict as H (fromList)
import Data.List.Ordered (nubSort)
import Data.Text (Text)
import Data.Text.ICU (Break, breakWord, breaks, brkBreak,
brkStatus, toLower)
import qualified Data.Text.ICU as I (Word (..))
import Data.Text.ICU.Types (LocaleName (..))
import Data.Vector (Vector, filter, null)
import qualified Data.Vector as V (fromList, toList)
import NLP.Snowball (Algorithm (..), stem)
import Etym (Lang (..), Origin, Target (..), Word (..),
rooted)
import Thesaurus (Thes)
data Fail = NoSyn
| NoEtym
| NotBad
deriving Show
data TextInfo = WordLike { wordLike :: WordInfo }
| NonWord { nonWord :: Text }
deriving Show
data WordInfo = WordInfo
{ orig :: Text
, info :: Either Fail (Vector Text)
} deriving Show
locale :: Lang -> LocaleName
locale (Lang "Dan") = Locale "da"
locale (Lang "Nld") = Locale "nl"
locale (Lang "Eng") = Locale "en"
locale (Lang "Fin") = Locale "fi"
locale (Lang "Fra") = Locale "fr"
locale (Lang "Ger") = Locale "de"
locale (Lang "Hun") = Locale "hu"
locale (Lang "Ita") = Locale "it"
locale (Lang "Nor") = Locale "no"
locale (Lang "Por") = Locale "pt"
locale (Lang "Ron") = Locale "ro"
locale (Lang "Rus") = Locale "ru"
locale (Lang "Spa") = Locale "es"
locale (Lang "Swe") = Locale "sv"
locale (Lang "Tur") = Locale "tr"
algo :: Lang -> Algorithm
algo (Lang "Dan") = Danish
algo (Lang "Nld") = Dutch
algo (Lang "Eng") = English
algo (Lang "Fin") = Finnish
algo (Lang "Fra") = French
algo (Lang "Ger") = German
algo (Lang "Hun") = Hungarian
algo (Lang "Ita") = Italian
algo (Lang "Nor") = Norwegian
algo (Lang "Por") = Portuguese
algo (Lang "Ron") = Romanian
algo (Lang "Rus") = Russian
algo (Lang "Spa") = Spanish
algo (Lang "Swe") = Swedish
algo (Lang "Tur") = Turkish
analyze :: HashMap Target Origin -> Thes ->
InLang -> BadLang -> GoodLang -> Text -> Vector TextInfo
analyze e s (InLang il) bl gl t = annotate (InLang il) h <$> ws where
h = synSet e s (InLang il) bl gl . (brkBreak <$>) .
filter ((== I.Letter) . brkStatus) $ ws
ws = V.fromList . breaks (breakWord $ locale il) $ t
annotate :: InLang -> HashMap Text (Either Fail (Vector Text)) -> Break I.Word ->
TextInfo
annotate (InLang il) h b
| brkStatus b == I.Letter =
WordLike . WordInfo (brkBreak b) $
h ! toLower (locale il) (brkBreak b)
| otherwise = NonWord . brkBreak $ b
newtype BadLang = BadLang { getBadLang :: Lang }
newtype GoodLang = GoodLang { getGoodLang :: Lang }
newtype InLang = InLang { getInLang :: Lang }
synSet :: HashMap Target Origin -> Thes ->
InLang -> BadLang -> GoodLang ->
Vector Text -> HashMap Text (Either Fail (Vector Text))
synSet e s (InLang il) bl'@(BadLang bl) gl ts =
H.fromList $ zip ts' (is <$> ts') where
is w | rooted e bl . Target $ w' =
either (const $ langSyns e s bl' gl w'') Right $ langSyns e s bl' gl w'
| otherwise = Left NotBad where
w' = Word il w
w'' = Word il . stem (algo il) $ w
ts' = nubSort . (toLower (locale il) <$>) . V.toList $ ts
-- Lang of Word ought to be same as language of thesaurus
langSyns :: HashMap Target Origin -> Thes -> BadLang ->
GoodLang -> Word -> Either Fail (Vector Text)
langSyns e s (BadLang bl) (GoodLang gl) (Word il w) =
origLang <=< note NoSyn . lookup w $ s where
origLang ss | null ss' = Left NoEtym
| otherwise = Right ss' where
ss' = filter (\(Target . Word il -> t) ->
rooted e gl t && not (rooted e bl t)) ss