Fix build failures

redpen-cc committed on Oct 15, 2017
commit f890b68 (1 parent: 3aa65a9)
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 40 deletions.
diff --git a/redpen-core/src/main/java/cc/redpen/tokenizer/NeologdJapaneseTokenizer.java b/redpen-core/src/main/java/cc/redpen/tokenizer/NeologdJapaneseTokenizer.java
@@ -17,39 +17,33 @@
  */
 package cc.redpen.tokenizer;
 
-import com.atilika.kuromoji.unidic.Tokenizer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseAnalyzer;
 import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer;
 import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
+import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.InflectionAttribute;
 import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
 import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.ReadingAttribute;
 
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.HashSet;
 import java.util.List;
 
 public class NeologdJapaneseTokenizer implements RedPenTokenizer {
 
-    private Tokenizer tokenizer;
-    private JapaneseAnalyzer analyzer;
+    private JapaneseTokenizer tokenizer;
 
     public NeologdJapaneseTokenizer() {
-        this.tokenizer = new Tokenizer();
-        this.analyzer = new JapaneseAnalyzer(null, JapaneseTokenizer.DEFAULT_MODE, null, new HashSet<String>());
+        this.tokenizer = new JapaneseTokenizer(new StringReader(""), null, false, JapaneseTokenizer.Mode.NORMAL);
     }
 
     @Override
     public List<TokenElement> tokenize(String content) {
         List<TokenElement> tokens = new ArrayList<>();
         try {
             for (TokenElement token : kuromojineologd(content)) {
-                System.out.println(token);
                 tokens.add(token);
             }
         } catch (IOException e) {
@@ -59,26 +53,34 @@ public List<TokenElement> tokenize(String content) {
     }
 
     private List<TokenElement> kuromojineologd(String src) throws IOException {
+        tokenizer.setReader(new StringReader(src));
         List<TokenElement> tokens = new ArrayList<>();
-        try (TokenStream tokenStream = analyzer.tokenStream("", new StringReader(src))) {
-            BaseFormAttribute baseAttr = tokenStream.addAttribute(BaseFormAttribute.class);
-            CharTermAttribute charAttr = tokenStream.addAttribute(CharTermAttribute.class);
-            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
-            ReadingAttribute readAttr = tokenStream.addAttribute(ReadingAttribute.class);
-            OffsetAttribute offsetAttr  = tokenStream.addAttribute(OffsetAttribute.class);
-
-            tokenStream.reset();
-            int offset = 0;
-            while (tokenStream.incrementToken()) {
-                String surface = charAttr.toString();
-                tokens.add(new TokenElement(surface,
-                        Arrays.asList(posAttr.getPartOfSpeech().split("-")),
-                        offsetAttr.startOffset(),
-                        readAttr.getReading()
-                        ));
-                offset += surface.length();
-            }
+        BaseFormAttribute baseAttr = tokenizer.addAttribute(BaseFormAttribute.class);
+        CharTermAttribute charAttr = tokenizer.addAttribute(CharTermAttribute.class);
+        PartOfSpeechAttribute posAttr = tokenizer.addAttribute(PartOfSpeechAttribute.class);
+        ReadingAttribute readAttr = tokenizer.addAttribute(ReadingAttribute.class);
+        OffsetAttribute offsetAttr  = tokenizer.addAttribute(OffsetAttribute.class);
+        InflectionAttribute inflectionAttr = tokenizer.addAttribute(InflectionAttribute.class);
+        tokenizer.reset();
+        while (tokenizer.incrementToken()) {
+            String surface = charAttr.toString();
+            tokens.add(new TokenElement(surface,
+                    getTagList(posAttr, inflectionAttr),
+                    offsetAttr.startOffset(),
+                    readAttr.getReading()
+            ));
         }
+        tokenizer.close();
         return tokens;
     }
+
+    private List<String> getTagList(PartOfSpeechAttribute posAttr, InflectionAttribute inflectionAttr) {
+        List<String> posList = new ArrayList<>();
+        posList.addAll(Arrays.asList(posAttr.getPartOfSpeech().split("-")));
+        String form = inflectionAttr.getInflectionForm() == null ? "*" : inflectionAttr.getInflectionForm();
+        String type = inflectionAttr.getInflectionType() == null ? "*" : inflectionAttr.getInflectionType();
+        posList.add(type);
+        posList.add(form);
+        return posList;
+    }
 }
diff --git a/...src/main/java/cc/redpen/validator/sentence/JapaneseAmbiguousNounConjunctionValidator.java b/...src/main/java/cc/redpen/validator/sentence/JapaneseAmbiguousNounConjunctionValidator.java
@@ -45,21 +45,21 @@ public void validate(Sentence sentence) {
             switch (stackSize) {
             case 0:
                 if (tags.get(0).equals("名詞")) {
-                    surfaces.add(tags.get(6));
+                    surfaces.add(tokenElement.getSurface());
                     stackSize = 1;
                 }
                 case 1:
-                if (tags.get(0).equals("助詞") && tags.get(6).equals("の")) {
-                    surfaces.add(tags.get(6));
+                if (tags.get(0).equals("助詞") && tokenElement.getSurface().equals("の")) {
+                    surfaces.add(tokenElement.getSurface());
                     stackSize = 2;
                 }
                 break;
             case 2:
                 if (tags.get(0).equals("名詞")) {
-                    surfaces.add(tags.get(6));
+                    surfaces.add(tokenElement.getSurface());
                 } else {
-                    if (tags.get(0).equals("助詞") && tags.get(6).equals("の")) {
-                        surfaces.add(tags.get(6));
+                    if (tags.get(0).equals("助詞") && tokenElement.getSurface().equals("の")) {
+                        surfaces.add(tokenElement.getSurface());
                         stackSize = 3;
                     } else {
                         surfaces.clear();
@@ -69,7 +69,7 @@ public void validate(Sentence sentence) {
                 break;
             case 3:
                 if (tags.get(0).equals("名詞")) {
-                    surfaces.add(tags.get(6));
+                    surfaces.add(tokenElement.getSurface());
                 } else {
                     String surface = String.join("", surfaces);
                     if (!inDictionary(surface)) {

diff --git a/...en-core/src/main/java/cc/redpen/validator/sentence/JapaneseBrokenExpressionValidator.java b/...en-core/src/main/java/cc/redpen/validator/sentence/JapaneseBrokenExpressionValidator.java
@@ -20,12 +20,10 @@
 import cc.redpen.model.Sentence;
 import cc.redpen.tokenizer.TokenElement;
 import cc.redpen.validator.Validator;
-import cc.redpen.util.Pair;
 
-import java.util.*;
-import java.util.regex.Pattern;
+import java.util.List;
+import java.util.Locale;
 
-import static java.util.Collections.emptySet;
 import static java.util.Collections.singletonList;
 
 /**
@@ -41,10 +39,10 @@ public void validate(Sentence sentence) {
         for (int i = 0; i < (tokens.size() - 1); ++i) {
             final TokenElement p = tokens.get(i);
             final List<String> ptags = p.getTags();
-            if (ptags.get(0).equals("動詞") && ptags.get(1).equals("自立") && ptags.get(4).equals("一段") && ptags.get(5).equals("未然形")) {
+            if (ptags.get(0).equals("動詞") && ptags.get(1).equals("自立") && ptags.get(2).equals("一段") && ptags.get(3).equals("未然形")) {
                 final TokenElement q = tokens.get(i+1);
                 final List<String> qtags = q.getTags();
-                if (qtags.get(0).equals("動詞") && qtags.get(1).equals("接尾") && qtags.get(4).equals("一段") && qtags.get(5).equals("未然形") && qtags.get(6).equals("られる")) {
+                if (qtags.get(0).equals("動詞") && qtags.get(1).equals("接尾") && qtags.get(2).equals("一段") && qtags.get(3).equals("未然形") && q.getSurface().equals("られ")) {
                 } else {
                     addLocalizedError(sentence, p.getSurface());
                 }