-
-
Notifications
You must be signed in to change notification settings - Fork 21
/
parser.lisp
139 lines (123 loc) · 4.53 KB
/
parser.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
(in-package #:org.shirakumo.plump.parser)
;; Characters the parser treats as whitespace.
;; The DEFVAR is wrapped in EVAL-WHEN so that the variable also has its value
;; at compile time; READ-ATTRIBUTES refers to it through a read-time #. form.
(eval-when (:compile-toplevel :load-toplevel :execute)
(defvar *whitespace* '(#\Space #\Newline #\Tab #\Return #\Linefeed #\Page)))
;; The node currently being read into. It is declared without a value here and
;; bound by READ-ROOT and READ-STANDARD-TAG; it is passed as the first argument
;; to MAKE-TEXT-NODE and MAKE-ELEMENT (presumably as the parent -- confirm).
(defvar *root*)
;; List of the closing-tag strings ("</" + tag name) of the elements whose
;; children are currently being read, innermost first. READ-CHILDREN pushes
;; onto it by rebinding.
(defvar *tagstack* ())
;; Matcher named WHITESPACE, defined in terms of the *whitespace* list.
(define-matcher whitespace (find *whitespace*))
;; Matcher named NAME for characters that may appear in a tag name.
;; NOTE(review): assuming IN denotes an inclusive character range, the ranges
;; are a-z, ?..Z (in ASCII: ? @ A-Z) and -..: (in ASCII: - . / 0-9 :), so "/"
;; would count as a name character -- confirm against the lexer's IN and ANY.
(define-matcher name (or (in #\a #\z) (in #\? #\Z) (in #\- #\:) (any #\\ #\_ #\! #\# #\[ #\])))
;; Matcher named TAG-END: either #\/ combined with a NEXT test for #\>, or a
;; plain #\>. Presumably NEXT applies its matcher to the following character,
;; which would make this match "/>" and ">" -- TODO confirm.
(define-matcher tag-end (or (and (is #\/) (next (is #\>))) (is #\>)))
(defun skip-whitespace ()
  "Advance the lexer for as long as the peeked character is in *WHITESPACE*.
Stops at the first non-whitespace character, or when PEEK returns NIL.
Returns NIL."
  (loop
    (unless (member (peek) *whitespace*)
      (return))
    (advance)))
(defun read-name ()
  "Consume and return the input up to the first position that is either not a
:NAME match or is a :TAG-END match."
  (let ((name-boundary (make-matcher (or (not :name) :tag-end))))
    (consume-until name-boundary)))
(defun read-text ()
  "Consume input up to the next #\\< that passes the (NEXT :NAME) test, decode
its entities, and return the result of MAKE-TEXT-NODE called on *ROOT* with
that text."
  (let* ((tag-start (make-matcher (and (is #\<) (next :name))))
         (raw-text (consume-until tag-start)))
    (make-text-node *root* (decode-entities raw-text))))
;; NOTE: this should be made robust against strings inside the tag that
;; contain a > character.
(defun read-tag-contents ()
  "Consume input up to the next :TAG-END match and return it with entities
decoded."
  (let ((contents (consume-until (make-matcher :tag-end))))
    (decode-entities contents)))
(defun read-children ()
  "Read tags and text nodes until the input is exhausted or something throws
to this element's closing-tag string.

The closing-tag string is \"</\" followed by (TAG-NAME *ROOT*). It is pushed
onto *TAGSTACK* for the duration of the call and also serves as the CATCH tag.
CATCH compares tags with EQ, so a thrower has to use this very string object
\(presumably taken from *TAGSTACK* -- confirm against the tag parsers).
Returns NIL when the input runs out, otherwise whatever value was thrown."
  (let* ((closer (concatenate 'string "</" (tag-name *root*)))
         (*tagstack* (list* closer *tagstack*)))
    (catch closer
      (loop
        (unless (peek)
          (return))
        ;; READ-TAG returns NIL when no tag starts here; fall back to text.
        (or (read-tag) (read-text))))))
(defun read-attribute-value ()
  "Read one attribute value at the current position and return it with
entities decoded.

A value that starts with a double or single quote is read up to the matching
quote character; both quotes are skipped and are not part of the result. Any
other value is read up to the next :WHITESPACE or :TAG-END match."
  (let* ((opener (peek))
         (raw (cond ((eql opener #\")
                     (advance)          ; skip the opening quote
                     (prog1 (consume-until (make-matcher (is #\")))
                       (advance)))      ; skip the closing quote
                    ((eql opener #\')
                     (advance)
                     (prog1 (consume-until (make-matcher (is #\')))
                       (advance)))
                    (T
                     (consume-until (make-matcher (or :whitespace :tag-end)))))))
    (decode-entities raw)))
(defun read-attribute-name ()
  "Consume and return the input up to the next #\\=, :WHITESPACE match or
:TAG-END match."
  (let ((name-end (make-matcher (or (is #\=) :whitespace :tag-end))))
    (consume-until name-end)))
(defun read-attribute ()
  "Read one attribute and return it as a cons (NAME . VALUE).

After the name and any following whitespace, one character is consumed:
  - #\\= : whitespace is skipped and the value is read with
           READ-ATTRIBUTE-VALUE;
  - NIL  : (CONSUME returned nothing) the value is the empty string and
           nothing is unread;
  - else : the character is put back with UNREAD and the value is the empty
           string."
  (let ((name (read-attribute-name)))
    (skip-whitespace)
    (let ((next (consume)))
      (cons name
            (cond ((null next)
                   "")
                  ((char= next #\=)
                   (skip-whitespace)
                   (read-attribute-value))
                  (T
                   (unread)
                   ""))))))
(defun read-attributes ()
  "Read attributes into a fresh map from MAKE-ATTRIBUTE-MAP and return it.

Stops, without consuming, when the peeked character is #\\/, #\\> or NIL.
Whitespace characters are skipped; anything else is read with READ-ATTRIBUTE
and stored under its name via GETHASH."
  (let ((attributes (make-attribute-map)))
    (loop
      (case (peek)
        ((#\/ #\> NIL)
         (return attributes))
        ;; The key list is spliced in at read time from *WHITESPACE*.
        (#.*whitespace*
         (advance))
        (T
         (destructuring-bind (key . value) (read-attribute)
           (setf (gethash key attributes) value)))))))
(defun read-standard-tag (name)
  "Read the remainder of a tag called NAME, whose name has already been
consumed, and return the element made for it.

One character is consumed after the name. If it is whitespace, the attributes
are read and one more character is consumed; otherwise the element gets an
empty attribute map. The last consumed character then decides:
  - #\\/ : one further character is skipped and the element is created without
           reading children;
  - #\\> : the element is created, bound to *ROOT*, and its children are read;
  - else : NIL is returned."
  (let ((terminator (consume))
        (attributes NIL))
    (if (member terminator *whitespace* :test #'eql)
        (setf attributes (read-attributes)
              terminator (consume))
        (setf attributes (make-attribute-map)))
    (cond ((eql terminator #\/)
           (advance)
           (make-element *root* name :attributes attributes))
          ((eql terminator #\>)
           (let ((*root* (make-element *root* name :attributes attributes)))
             (read-children)
             *root*)))))
;; Try to read one tag at the current lexer position.
;; One character is consumed; the tag path is taken only if that character is
;; #\< and the :NAME matcher succeeds at the position after it. Otherwise the
;; consumed character is put back with UNREAD and NIL is returned, which lets
;; callers fall back to READ-TEXT through OR.
;; On the tag path the name is read and offered to the tag parsers; the first
;; one whose test accepts the name produces the result. READ-STANDARD-TAG is
;; passed to DO-TAG-PARSERS as well (presumably as the fallback used when no
;; parser matches -- confirm against the macro's definition).
(defun read-tag ()
;; If CONSUME returns NIL, a space is substituted so that CHAR= still gets a
;; character and simply fails.
(if (and (char= #\< (or (consume) #\ ))
(funcall (make-matcher :name)))
(let ((name (read-name)))
(or (do-tag-parsers (test func (read-standard-tag name))
(when (funcall (the function test) name)
(return (funcall (the function func) name))))
(progn ;; It seems we can't parse this tag for some reason,
;; read it as a text node instead. In order to avoid the
;; auto-breaking of the text node on < and a subsequently
;; resulting infinite-loop, don't unwind fully and instead
;; prepend the < manually.
;; Only the name is unread here, not the leading <.
(unread-n (length name))
(let ((text (read-text)))
(setf (text text) (concatenate 'string "<" (text text)))
text))))
(progn (unread) NIL)))
(defun read-root (&optional (root (make-root)))
  "Bind *ROOT* to ROOT (a fresh result of MAKE-ROOT by default), read tags and
text nodes until PEEK returns NIL, and return the root."
  (let ((*root* root))
    (loop
      (when (null (peek))
        (return))
      ;; READ-TAG returns NIL when no tag starts here; fall back to text.
      (or (read-tag) (read-text)))
    *root*))
(defun slurp-stream (stream)
  "Read everything that remains on the character input STREAM and return it
as a string.

The stream is read in chunks of 4096 characters; reading stops after the
first chunk that READ-SEQUENCE does not fill completely."
  (declare (stream stream))
  (let* ((chunk-size 4096)
         (chunk (make-array chunk-size :element-type 'character)))
    (with-output-to-string (out)
      (loop
        (let ((filled (read-sequence chunk stream)))
          ;; Only the part of the chunk that was actually filled is copied.
          (write-sequence chunk out :start 0 :end filled)
          (when (/= filled chunk-size)
            (return)))))))
(defgeneric parse (input &key root)
  (:documentation "Parse INPUT and return the root produced by READ-ROOT.

INPUT may be a string, a pathname or a stream. A string is parsed directly;
a pathname is opened for input and parsed as a stream; a stream is read in
full with SLURP-STREAM and parsed as a string. When ROOT is non-NIL it is
handed to READ-ROOT, otherwise READ-ROOT is called without arguments.")
  (:method ((input string) &key root)
    ;; A string that is not a SIMPLE-STRING is copied with COPY-SEQ before
    ;; being handed to the lexer environment.
    (let ((markup (if (simple-string-p input)
                      input
                      (copy-seq input))))
      (with-lexer-environment (markup)
        (cond (root (read-root root))
              (T (read-root))))))
  (:method ((input pathname) &key root)
    (with-open-file (in input :direction :input)
      (parse in :root root)))
  (:method ((input stream) &key root)
    (let ((contents (slurp-stream input)))
      (parse contents :root root))))