-
Notifications
You must be signed in to change notification settings - Fork 1
/
SphinxEscape.hs
247 lines (204 loc) · 7.73 KB
/
SphinxEscape.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
{-# LANGUAGE OverloadedStrings, RecordWildCards, ScopedTypeVariables, FlexibleContexts #-}
module SphinxEscape where
import Control.Applicative
import Data.Char
import Data.Functor.Identity (Identity)
import Data.List
import Data.String.Utils (strip)
import Text.Parsec hiding (many, (<|>))
-- | Extract tag and author filters and prepare resulting
-- query string for submission to Sphinx.
transformQuery :: String -- ^ Original query string
-> ([String], [String], String) -- ^ tag names, author names, query
transformQuery q = (ts', as', q')
where
(ts, as, qs) = extractFilters $ parseFilters q
(ts', as') = formatFilters ts as
q' = formatQuery . parseQueryNoFilters $ formatQueryNoEscaping qs
extractFilters :: [Expression] -> ([Expression], [Expression], [Expression])
extractFilters es = (ts, as, q')
where
(ts, q) = partition isTagFilter es
(as, q') = partition isAuthorFilter q
formatFilters :: [Expression] -> [Expression] -> ([String], [String])
formatFilters ts as = (map tagNameFromExpression ts, map authorNameFromExpression as)
formatQueryWith :: (Expression -> String) -> [Expression] -> String
formatQueryWith f = strip . intercalate " " . map (strip . f)
-- Format query expressions without escaping special characters.
-- This allows a second pass to recognize boolean operators
-- as special characters or words.
formatQueryNoEscaping :: [Expression] -> String
formatQueryNoEscaping = formatQueryWith toStringNoEscaping
-- Format query expressions with escaping of special characters.
formatQuery :: [Expression] -> String
formatQuery = formatQueryWith toString
-- Just a simplified syntax tree. Besides this, all other input has its
-- non-alphanumeric characters stripped, including double and single quotes and
-- parentheses
data Expression =
TagFilter String
| AuthorFilter String
| Literal String
| Phrase String
| AndOrExpr Conj Expression Expression
deriving Show
data Conj = And | Or deriving Show
toStringNoEscaping :: Expression -> String
toStringNoEscaping (TagFilter s) = "tag:" ++ maybeQuote s
toStringNoEscaping (AuthorFilter s) = "author:" ++ maybeQuote s
toStringNoEscaping (Literal s) = s
toStringNoEscaping (Phrase s) = quote s -- no need to escape the contents
toStringNoEscaping (AndOrExpr c a b) =
let a' = toStringNoEscaping a
b' = toStringNoEscaping b
c' = conjToString c
-- if either a' or b' is just whitespace, just choose one or the other
in case (all isSpace a', all isSpace b') of
(True, False) -> b'
(False, True) -> a'
(False, False) -> a' ++ c' ++ b'
_ -> ""
-- escapes expression to string to pass to sphinx
toString :: Expression -> String
toString (TagFilter s) = "tag:" ++ maybeQuote (escapeString s)
toString (AuthorFilter s) = "author:" ++ maybeQuote (escapeString s)
toString (Literal s) = escapeString s
toString (Phrase s) = quote s -- no need to escape the contents
toString (AndOrExpr c a b) =
let a' = toString a
b' = toString b
c' = conjToString c
-- if either a' or b' is just whitespace, just choose one or the other
in case (all isSpace a', all isSpace b') of
(True, False) -> b'
(False, True) -> a'
(False, False) -> a' ++ c' ++ b'
_ -> ""
quote :: String -> String
quote s = "\"" ++ s ++ "\""
maybeQuote :: String -> String
maybeQuote s = if any isSpace s then quote s else s
conjToString :: Conj -> String
conjToString And = " & "
conjToString Or = " | "
-- removes all non-alphanumerics from literal strings that could be parsed
-- mistakenly as Sphinx Extended Query operators
escapeString :: String -> String
escapeString = map stripAlphaNum
stripAlphaNum :: Char -> Char
stripAlphaNum s | isAlphaNum s = s
| otherwise = ' '
-----------------------------------------------------------------------
-- Parse filters
type Parser' = ParsecT String () Identity
parseFilters :: String -> [Expression]
parseFilters inp =
case Text.Parsec.parse (many filtersAndLiterals) "" inp of
Left x -> error $ "parser failed: " ++ show x
Right xs -> xs
filtersAndLiterals :: Parser' Expression
filtersAndLiterals = try tagFilter <|> try authorFilter <|> try phrase <|> literal
tagFilter :: Parser' Expression
tagFilter = do
try (string "tag:") <|> try (string "@(tag_list)") <|> string "@tag_list"
many space
x <- (try phrase <|> literal)
let
s = case x of
Phrase p -> p
Literal l -> l
otherwise -> "" -- will never be returned (parse error)
return $ TagFilter s
authorFilter :: Parser' Expression
authorFilter = do
string "author:"
many space
x <- (try phrase <|> literal)
let
s = case x of
Phrase p -> p
Literal l -> l
otherwise -> "" -- will never be returned (parse error)
return $ AuthorFilter s
phrase :: Parser' Expression
phrase = do
Phrase <$>
(between (char '"') (char '"') (many tagChar))
where tagChar =
char '\\' *> (char '"')
<|> satisfy (`notElem` ("\"\\" :: String))
-- char '"'
-- xs <- manyTill anyChar (char '"')
-- return . Phrase $ xs
-- Copied from http://book.realworldhaskell.org/read/using-parsec.html
-- p_string :: CharParser () String
-- p_string = between (char '\"') (char '\"') (many jchar)
-- where jchar = char '\\' *> (p_escape <|> p_unicode)
-- <|> satisfy (`notElem` "\"\\")
--
literalStop :: Parser' ()
literalStop = (choice [
lookAhead (tagFilter >> return ())
, lookAhead (authorFilter >> return ())
, lookAhead (phrase >> return ())
, (space >> return ())
, eof
])
<?> "literalStop"
literal :: Parser' Expression
literal = do
a <- anyChar
xs <- manyTill anyChar (try literalStop)
return . Literal $ a:xs
-----------------------------------------------------------------------
-- Parse query string after tag and author filters have been removed.
parseQueryNoFilters :: String -> [Expression]
parseQueryNoFilters inp =
case Text.Parsec.parse (many expressionNoFilters) "" inp of
Left x -> error $ "parser failed: " ++ show x
Right xs -> xs
expressionNoFilters :: Parser' Expression
expressionNoFilters = try andOrExpr <|> try phrase <|> literalNoFilters
andOrExpr :: Parser' Expression
andOrExpr = do
a <- (try phrase <|> literalNoFilters)
x <- try conjExpr
b <- expressionNoFilters -- recursion
return $ AndOrExpr x a b
conjExpr :: Parser' Conj
conjExpr = andExpr <|> orExpr
andExpr :: Parser' Conj
andExpr = mkConjExpr ["and", "AND", "&"] And
orExpr :: Parser' Conj
orExpr = mkConjExpr ["or", "OR", "|"] Or
mkConjExpr :: [String] -> Conj -> Parser' Conj
mkConjExpr xs t =
try (many1 space >> choice (map (string . (++" ")) xs))
>> return t
literalStopNoFilters :: Parser' ()
literalStopNoFilters = (choice [
lookAhead (conjExpr >> return ())
, lookAhead (phrase >> return ())
, (space >> return ())
, eof
])
<?> "literalStopNoFilters'"
literalNoFilters :: Parser' Expression
literalNoFilters = do
a <- anyChar
xs <- manyTill anyChar (try literalStopNoFilters)
return . Literal $ a:xs
-----------------------------------------------------------------------
-- Helper functions
isTagFilter :: Expression -> Bool
isTagFilter (TagFilter _) = True
isTagFilter _ = False
tagNameFromExpression :: Expression -> String
tagNameFromExpression (TagFilter t) = t
tagNameFromExpression _ = error "tagNameFromExpression: not tag"
isAuthorFilter :: Expression -> Bool
isAuthorFilter (AuthorFilter _) = True
isAuthorFilter _ = False
authorNameFromExpression :: Expression -> String
authorNameFromExpression (AuthorFilter t) = t
authorNameFromExpression _ = error "authorNameFromExpression: not author"