-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathPOSTagger.js
124 lines (116 loc) · 4.04 KB
/
POSTagger.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*!
* jsPOS
*
* Copyright 2010, Percy Wegmann
* Licensed under the LGPLv3 license
* http://www.opensource.org/licenses/lgpl-3.0.html
*/
module.exports = POSTagger;
/**
 * Creates a part-of-speech tagger.
 *
 * The lexicon module ('./lexicon') is loaded when the constructor runs and
 * is stored on `this.lexicon`, where `wordInLexicon` and `tag` read it.
 *
 * @constructor
 */
function POSTagger() {
    var loadedLexicon = require('./lexicon');
    this.lexicon = loadedLexicon;
}
/**
 * Tests whether `$this` begins with `string`.
 *
 * @param {string} $this - The text to inspect.
 * @param {string} string - The prefix to look for.
 * @returns {boolean} `true` when `string` is non-empty and its first
 *   occurrence in `$this` is at index 0; `false` otherwise (an empty or
 *   missing prefix never matches).
 */
function startsWith($this, string) {
    // Short-circuit first so a falsy prefix never touches `$this`.
    var hasPrefix = Boolean(string);
    return hasPrefix && $this.indexOf(string) === 0;
}
/**
 * Tests whether `$this` ends with `string`.
 *
 * @param {string} $this - The text to inspect.
 * @param {string} string - The suffix to look for.
 * @returns {boolean} `true` when `string` is non-empty, no longer than
 *   `$this`, and occupies the final `string.length` characters of `$this`;
 *   `false` otherwise (an empty or missing suffix never matches).
 */
function endsWith($this, string) {
    if (!string || string.length > $this.length)
        return false;
    // Search from the end: the suffix may also occur earlier in the text
    // ("needed" / "ed", "singing" / "ing"), and only the LAST occurrence
    // can sit at the tail position.
    return $this.lastIndexOf(string) === $this.length - string.length;
}
/**
 * Reports whether a word has an entry in the lexicon.
 *
 * The word is looked up exactly as given first, then in lower case
 * (1/22/2002 mod, from the Lisp code). Only the lexicon's OWN keys count,
 * so names inherited from Object.prototype ("constructor", "toString", ...)
 * are not mistaken for lexicon words.
 *
 * @param {string} word - The word to look up.
 * @returns {boolean} `true` if the exact word has a non-null entry or its
 *   lower-cased form has a truthy entry; `false` otherwise.
 */
POSTagger.prototype.wordInLexicon = function(word) {
    var lexicon = this.lexicon;
    var hasOwn = Object.prototype.hasOwnProperty;

    if (hasOwn.call(lexicon, word) && lexicon[word] != null)
        return true;

    // Not found as written: fall back to the lower-cased form.
    var lowered = word.toLowerCase();
    return hasOwn.call(lexicon, lowered) && Boolean(lexicon[lowered]);
};
/**
 * Assigns a part-of-speech tag to every word.
 *
 * Pass 1 takes each word's initial tag from the lexicon: the word is looked
 * up as given, then in lower case, and element 0 of the entry is used as
 * the tag. Words without an entry start as "NN". Pass 2 applies the
 * transformational rules below, in order, to refine those tags.
 *
 * @param {string[]} words - The tokens to tag, in sentence order.
 * @returns {Array} One `[word, tag]` pair per input word, in input order.
 */
POSTagger.prototype.tag = function(words) {
    var lexicon = this.lexicon;
    var hasOwn = Object.prototype.hasOwnProperty;
    var size = words.length;
    var ret = new Array(size);
    var i;

    for (i = 0; i < size; i++) {
        // Own-property lookups only: an inherited name such as
        // "constructor" must not be treated as a lexicon entry.
        var ss = hasOwn.call(lexicon, words[i]) ? lexicon[words[i]] : null;
        // 1/22/2002 mod (from Lisp code): if not in hash, try lower case:
        if (!ss) {
            var lowered = words[i].toLowerCase();
            ss = hasOwn.call(lexicon, lowered) ? lexicon[lowered] : null;
        }
        // Unknown words default to common noun. (An earlier special case
        // that assigned word + "^" to unknown one-character words was
        // always overwritten by this default, so "NN" is the tag such
        // words have always received.)
        ret[i] = ss ? ss[0] : "NN";
    }

    /**
     * Apply transformational rules
     **/
    for (i = 0; i < size; i++) {
        // Tag as it stood when this iteration began; rules 1 and 2 test
        // this value, later rules test the current ret[i].
        var initialTag = ret[i];

        // rule 1: DT, {VBD | VBP | VB} --> DT, NN
        if (i > 0 && ret[i - 1] == "DT") {
            if (initialTag == "VBD" ||
                initialTag == "VBP" ||
                initialTag == "VB") {
                ret[i] = "NN";
            }
        }

        // rule 2: convert a noun to a number (CD) if "." appears in the word
        if (startsWith(initialTag, "N")) {
            if (words[i].indexOf(".") > -1) {
                // url if there are two contiguous alpha characters
                if (/[a-zA-Z]{2}/.test(words[i]))
                    ret[i] = "URL";
                else
                    ret[i] = "CD";
            }
            // Attempt to convert into a number. Compare against NaN rather
            // than testing truthiness so that "0" is recognised too.
            if (!isNaN(parseFloat(words[i])))
                ret[i] = "CD";
        }

        // rule 3: convert a noun to a past participle if words[i] ends with "ed"
        if (startsWith(ret[i], "N") && endsWith(words[i], "ed"))
            ret[i] = "VBN";

        // rule 4: convert any type to adverb if it ends in "ly";
        if (endsWith(words[i], "ly"))
            ret[i] = "RB";

        // rule 5: convert a common noun (NN or NNS) to an adjective if the
        // word ends with "al". Exact tag match on purpose: proper nouns
        // (NNP, NNPS) must not be converted.
        if ((ret[i] === "NN" || ret[i] === "NNS") && endsWith(words[i], "al"))
            ret[i] = "JJ";

        // rule 6: convert a noun to a verb if the preceding word is "would"
        if (i > 0 && startsWith(ret[i], "NN") && words[i - 1].toLowerCase() == "would")
            ret[i] = "VB";

        // rule 7: if a word has been categorized as a common noun and it ends with "s",
        //         then set its type to plural common noun (NNS)
        if (ret[i] == "NN" && endsWith(words[i], "s"))
            ret[i] = "NNS";

        // rule 8: convert a common noun to a present participle verb (i.e., a gerund)
        if (startsWith(ret[i], "NN") && endsWith(words[i], "ing"))
            ret[i] = "VBG";
    }

    var result = [];
    for (i = 0; i < size; i++) {
        result[i] = [words[i], ret[i]];
    }
    return result;
};
/**
 * Writes each tagged word on its own line in the form `word(TAG)`.
 *
 * Output goes through the host's global `print` function when one exists
 * (as before); otherwise it falls back to `console.log`, so the method also
 * works in environments such as Node.js that define no `print`.
 *
 * @param {Array} taggedWords - `[word, tag]` pairs, as returned by `tag`.
 */
POSTagger.prototype.prettyPrint = function(taggedWords) {
    var hasPrint = typeof print === "function";
    for (var i = 0; i < taggedWords.length; i++) {
        var line = taggedWords[i][0] + "(" + taggedWords[i][1] + ")";
        if (hasPrint)
            print(line);
        else
            console.log(line);
    }
};
//print(new POSTagger().tag(["i", "went", "to", "the", "store", "to", "buy", "5.2", "gallons", "of", "milk"]));