redpen-cc · takahi-i · Oct 26, 2017 · Sep 10, 2017 · Sep 14, 2017 · Oct 4, 2017
diff --git a/redpen-cli/sample/conf/redpen-conf-ja.xml b/redpen-cli/sample/conf/redpen-conf-ja.xml
@@ -11,6 +11,7 @@
         <validator name="SuccessiveWord" />
         <validator name="JapaneseStyle" />
         <validator name="InvalidExpression" />
+        <validator name="JapaneseExpressionVariation" level="Info"/>
         <validator name="DoubleNegative" />
         <validator name="Okurigana"/>
         <validator name="JapaneseNumberExpression"/>

diff --git a/redpen-cli/sample/sample-doc/ja/sampledoc-ja.asciidoc b/redpen-cli/sample/sample-doc/ja/sampledoc-ja.asciidoc
@@ -3,7 +3,7 @@
 = 分散処理
 最近利用されているソフトウェアの中には複数の計算機上で動作（分散）するものが多く存在し、
 このような分散ソフトウェアは複数の計算機で動作することで大量のデータを扱えたり，高負荷な状況に対処できたりします。
-本稿では,複数の計算機（クラスタ）でで動作する各サーバーを_「インスタンス」_と呼びまます。
+本稿では,複数の計算機（Cluster）でで動作する各サーバーを_「インスタンス」_と呼びまます。
 
 たとえば検索エンジンやデータベースではインデックスを複数のインスタンスで分割して保持します。
-このような場合、各インデクスの結果をマージして_clientプログラム_に渡す機構が必要となります。
+このような場合、クラスター内の各インデクス結果をマージして_clientプログラム_に渡す機構が必要となります。
diff --git a/redpen-cli/sample/sample-doc/ja/sampledoc-ja.md b/redpen-cli/sample/sample-doc/ja/sampledoc-ja.md
@@ -1,6 +1,6 @@
 <!-- @suppress SpaceBetweenAlphabeticalWord -->
 # 分散処理
 最近利用されているソフトウェアの中には複数の計算機上で動作（分散）するものが多く存在し、このような分散ソフトウェアは複数の計算機で動作することで大量のデータを扱えたり、高負荷な状況に対処できたりします。
-本稿では,複数の計算機（クラスタ）でで動作する各サーバーを**インスタンス**と呼びまます。
+本稿では,複数の計算機（Cluster）でで動作する各サーバーを**インスタンス**と呼びまます。
 たとえば検索エンジンやデータベースではインデックスを複数のインスタンスで分割して保持します。
-このような場合、各インデクスの結果をマージしてclientプログラムに渡す機構が必要となります。
+このような場合、クラスター内の各インデクス結果をマージしてclientプログラムに渡す機構が必要となります。
diff --git a/redpen-cli/sample/sample-doc/ja/sampledoc-ja.re b/redpen-cli/sample/sample-doc/ja/sampledoc-ja.re
@@ -1,6 +1,6 @@
 #@# @suppress SpaceBetweenAlphabeticalWord
 = 分散処理
 最近利用されているソフトウェアの中には複数の計算機上で動作（分散）するものが多く存在し、このような分散ソフトウェアは複数の計算機で動作することで大量のデータを扱えたり、
-高負荷な状況に対処できたりします。本稿では,複数の計算機（クラスタ）でで動作する各サーバーを@<strong>{インスタンス}と呼びまます。
+高負荷な状況に対処できたりします。本稿では,複数の計算機（Cluster）でで動作する各サーバーを@<strong>{インスタンス}と呼びまます。
 たとえば検索エンジンやデータベースではインデックスを複数のインスタンスで分割して保持します。
-このような場合、各インデクスの結果をマージしてclientプログラムに渡す機構が必要となります。
+このような場合、クラスター内の各インデクス結果をマージしてclientプログラムに渡す機構が必要となります。
diff --git a/redpen-cli/sample/sample-doc/ja/sampledoc-ja.rst b/redpen-cli/sample/sample-doc/ja/sampledoc-ja.rst
@@ -3,6 +3,6 @@
 #######
 
 最近利用されているソフトウェアの中には複数の計算機上で動作（分散）するものが多く存在し、このような分散ソフトウェアは複数の計算機で動作することで大量のデータを扱えたり、高負荷な状況に対処できたりします。
-本稿では,複数の計算機（クラスタ）でで動作する各サーバーを**インスタンス**と呼びまます。
+本稿では,複数の計算機（Cluster）でで動作する各サーバーを**インスタンス**と呼びまます。
 たとえば検索エンジンやデータベースではインデックスを複数のインスタンスで分割して保持します。
-このような場合、各インデクスの結果をマージしてclientプログラムに渡す機構が必要となります。
+このような場合、クラスター内の各インデクス結果をマージしてclientプログラムに渡す機構が必要となります。
diff --git a/redpen-cli/sample/sample-doc/ja/sampledoc-ja.tex b/redpen-cli/sample/sample-doc/ja/sampledoc-ja.tex
@@ -16,9 +16,9 @@
 % @suppress SpaceBetweenAlphabeticalWord
 \section{要約}
 最近利用されているソフトウェアの中には複数の計算機上で動作（分散）するものが多く存在し、このような分散ソフトウェアは複数の計算機で動作することで大量のデータを扱えたり，高負荷な状況に対処できたりします。
-本稿では,複数の計算機（クラスタ）でで動作する各サーバーを「インスタンス」と呼びまます。
+本稿では,複数の計算機（Cluster）でで動作する各サーバーを「インスタンス」と呼びまます。
 たとえば検索エンジンやデータベースではインデックスを複数のインスタンスで分割して保持します。
-このような場合、各インデクスの結果をマージしてclientプログラムに渡す機構が必要となります。
+このような場合、くらすたー内の各インデクス結果をマージしてclientプログラムに渡す機構が必要となります。
 
 %% \bibliographystyle{plain}
 %% \bibliography{reference,reference-j}

diff --git a/redpen-cli/sample/sample-doc/ja/sampledoc-ja.txt b/redpen-cli/sample/sample-doc/ja/sampledoc-ja.txt
@@ -1,4 +1,4 @@
 最近利用されているソフトウェアの中には複数の計算機上で動作（分散）するものが多く存在し、このような分散ソフトウェアは複数の計算機で動作することで大量のデータを扱えたり，高負荷な状況に対処できたりします。
-本稿では,複数の計算機（クラスタ）でで動作する各サーバーを「インスタンス」と呼びまます。
+本稿では,複数の計算機（Cluster）でで動作する各サーバーを「インスタンス」と呼びまます。
 たとえば検索エンジンやデータベースではインデックスを複数のインスタンスで分割して保持します。
-このような場合、各インデクスの結果をマージしてclientプログラムに渡す機構が必要となります。
+このような場合、クラスター内の各インデクス結果をマージしてclientプログラムに渡す機構が必要となります。
diff --git a/redpen-core/pom.xml b/redpen-core/pom.xml
@@ -168,10 +168,15 @@
     <dependencies>
         <dependency>
             <groupId>com.atilika.kuromoji</groupId>
-            <artifactId>kuromoji-ipadic</artifactId>
+            <artifactId>kuromoji-unidic</artifactId>
             <version>0.9.0</version>
             <type>jar</type>
         </dependency>
+        <dependency>
+            <groupId>org.codelibs</groupId>
+            <artifactId>elasticsearch-analysis-kuromoji-neologd</artifactId>
+            <version>1.6.0</version>
+        </dependency>
         <dependency>
             <groupId>org.junit.jupiter</groupId>
             <artifactId>junit-jupiter-api</artifactId>

diff --git a/redpen-core/src/main/java/cc/redpen/RedPen.java b/redpen-core/src/main/java/cc/redpen/RedPen.java
@@ -229,6 +229,12 @@ private void applyPreprocessorRules(List<Document> documents, Map<Document, List
     }
 
     private void runDocumentValidators(List<Document> documents, Map<Document, List<ValidationError>> docErrorsMap) {
+        // run the preValidate hook of every validator on each document
+        for (Document document : documents) {
+            validators.forEach(e -> e.preValidate(document));
+        }
+
+        // run the validators on each document and collect their errors
         for (Document document : documents) {
             List<ValidationError> errors = new ArrayList<>();
             validators.forEach(e -> {e.setErrorList(errors); e.validate(document);});

diff --git a/redpen-core/src/main/java/cc/redpen/config/Configuration.java b/redpen-core/src/main/java/cc/redpen/config/Configuration.java
@@ -18,7 +18,7 @@
 package cc.redpen.config;
 
 import cc.redpen.RedPenException;
-import cc.redpen.tokenizer.JapaneseTokenizer;
+import cc.redpen.tokenizer.NeologdJapaneseTokenizer;
 import cc.redpen.tokenizer.RedPenTokenizer;
 import cc.redpen.tokenizer.WhiteSpaceTokenizer;
 import cc.redpen.validator.ValidatorFactory;
@@ -66,7 +66,7 @@ public static List<String> getDefaultConfigKeys() {
     }
 
     private void initTokenizer() {
-        this.tokenizer = lang.equals("ja") ? new JapaneseTokenizer() : new WhiteSpaceTokenizer();
+        this.tokenizer = lang.equals("ja") ? new NeologdJapaneseTokenizer() : new WhiteSpaceTokenizer();
     }
 
     /**

diff --git a/redpen-core/src/main/java/cc/redpen/tokenizer/JapaneseTokenizer.java b/redpen-core/src/main/java/cc/redpen/tokenizer/JapaneseTokenizer.java
diff --git a/redpen-core/src/main/java/cc/redpen/tokenizer/NeologdJapaneseTokenizer.java b/redpen-core/src/main/java/cc/redpen/tokenizer/NeologdJapaneseTokenizer.java
@@ -0,0 +1,110 @@
+/**
+ * redpen: a text inspection tool
+ * Copyright (c) 2014-2015 Recruit Technologies Co., Ltd. and contributors
+ * (see CONTRIBUTORS.md)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package cc.redpen.tokenizer;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer;
+import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
+import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.InflectionAttribute;
+import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
+import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.ReadingAttribute;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.UncheckedIOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * {@link RedPenTokenizer} for Japanese text backed by the
+ * {@code JapaneseTokenizer} from the {@code org.codelibs.neologd} package.
+ *
+ * <p>NOTE(review): one tokenizer instance is reused for every call, so this class is
+ * presumably not safe for concurrent use - confirm before sharing across threads.
+ */
+public class NeologdJapaneseTokenizer implements RedPenTokenizer {
+
+    private final JapaneseTokenizer tokenizer;
+
+    public NeologdJapaneseTokenizer() {
+        // The empty reader is only a placeholder; the real input is set on every tokenize call.
+        this.tokenizer = new JapaneseTokenizer(new StringReader(""), null, false, JapaneseTokenizer.Mode.NORMAL);
+    }
+
+    /**
+     * Splits the given text into tokens.
+     *
+     * @param content text to tokenize
+     * @return tokens in the order produced by the underlying tokenizer
+     * @throws UncheckedIOException if the underlying tokenizer fails while reading the content
+     */
+    @Override
+    public List<TokenElement> tokenize(String content) {
+        try {
+            return kuromojineologd(content);
+        } catch (IOException e) {
+            // Surface the failure instead of silently handing back an empty token list.
+            throw new UncheckedIOException("Failed to tokenize Japanese content", e);
+        }
+    }
+
+    /**
+     * Runs the underlying tokenizer over {@code src} and converts every produced token
+     * into a {@link TokenElement} carrying surface, tags, start offset and reading.
+     */
+    private List<TokenElement> kuromojineologd(String src) throws IOException {
+        CharTermAttribute charAttr = tokenizer.addAttribute(CharTermAttribute.class);
+        PartOfSpeechAttribute posAttr = tokenizer.addAttribute(PartOfSpeechAttribute.class);
+        ReadingAttribute readAttr = tokenizer.addAttribute(ReadingAttribute.class);
+        OffsetAttribute offsetAttr = tokenizer.addAttribute(OffsetAttribute.class);
+        InflectionAttribute inflectionAttr = tokenizer.addAttribute(InflectionAttribute.class);
+
+        List<TokenElement> tokens = new ArrayList<>();
+        tokenizer.setReader(new StringReader(src));
+        try {
+            tokenizer.reset();
+            while (tokenizer.incrementToken()) {
+                tokens.add(new TokenElement(charAttr.toString(),
+                        getTagList(posAttr, inflectionAttr),
+                        offsetAttr.startOffset(),
+                        readAttr.getReading()));
+            }
+            // Finish the stream before closing it.
+            tokenizer.end();
+        } finally {
+            // Always close so the reused tokenizer can accept the next reader, even after a failure.
+            tokenizer.close();
+        }
+        return tokens;
+    }
+
+    /**
+     * Builds the tag list: the part-of-speech string split on '-', followed by the
+     * inflection type and then the inflection form ("*" when either one is absent).
+     */
+    private List<String> getTagList(PartOfSpeechAttribute posAttr, InflectionAttribute inflectionAttr) {
+        List<String> posList = new ArrayList<>(Arrays.asList(posAttr.getPartOfSpeech().split("-")));
+        String form = inflectionAttr.getInflectionForm() == null ? "*" : inflectionAttr.getInflectionForm();
+        String type = inflectionAttr.getInflectionType() == null ? "*" : inflectionAttr.getInflectionType();
+        posList.add(type);
+        posList.add(form);
+        return posList;
+    }
+}
diff --git a/redpen-core/src/main/java/cc/redpen/tokenizer/TokenElement.java b/redpen-core/src/main/java/cc/redpen/tokenizer/TokenElement.java
@@ -22,7 +22,7 @@
 import java.util.List;
 
 public class TokenElement implements Serializable {
-    private static final long serialVersionUID = -7779529873101010570L;
+    private static final long serialVersionUID = -9055285891555999514L;
 
     // the surface form of the token
     final private String surface;
@@ -33,10 +33,18 @@ public class TokenElement implements Serializable {
     // the character position of the token in the sentence
     final private int offset;
 
-    public TokenElement(String word, List<String> tagList, int offset) {
+    // token reading
+    final private String reading;
+
+    public TokenElement(String word, List<String> tagList, int offset, String reading) {
         surface = word;
         tags = Collections.unmodifiableList(tagList);
         this.offset = offset;
+        this.reading = reading;
+    }
+
+    public TokenElement(String word, List<String> tagList, int offset) {
+        this(word, tagList, offset, word);
     }
 
     public String getSurface() {
@@ -51,33 +59,37 @@ public int getOffset() {
         return offset;
     }
 
+    public String getReading() { return reading; }
+
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
-        if (o == null || getClass() != o.getClass()) return false;
+        if (!(o instanceof TokenElement)) return false;
 
         TokenElement that = (TokenElement) o;
 
-        if (surface != null ? !surface.equals(that.surface) : that.surface != null) return false;
-        if (tags != null ? !tags.equals(that.tags) : that.tags != null) return false;
-
-        return true;
+        if (offset != that.offset) return false;
+        if (surface == null ? that.surface != null : !surface.equals(that.surface)) return false; // null-safe, as before
+        if (!tags.equals(that.tags)) return false;
+        return reading == null ? that.reading == null : reading.equals(that.reading); // reading may be null
     }
 
     @Override
     public int hashCode() {
-        int result = surface != null ? surface.hashCode() : 0;
-        result = 31 * result + (tags != null ? tags.hashCode() : 0);
-        result = 64 * result + offset;
+        int result = surface == null ? 0 : surface.hashCode(); // null-safe, as before
+        result = 31 * result + tags.hashCode();
+        result = 31 * result + offset;
+        result = 31 * result + (reading == null ? 0 : reading.hashCode()); // reading may be null
         return result;
     }
 
     @Override
     public String toString() {
         return "TokenElement{" +
                 "surface='" + surface + '\'' +
-                ", offset=" + offset +
                 ", tags=" + tags +
+                ", offset=" + offset +
+                ", reading='" + reading + '\'' +
                 '}';
     }
 }
diff --git a/redpen-core/src/main/java/cc/redpen/validator/JavaScriptLoader.java b/redpen-core/src/main/java/cc/redpen/validator/JavaScriptLoader.java
@@ -169,8 +169,8 @@ public void addLocalizedError(String messageKey, Sentence sentenceWithError, Obj
     }
 
     @Override
-    public void addLocalizedErrorFromToken(Sentence sentenceWithError, TokenElement token) {
-        super.addLocalizedErrorFromToken(sentenceWithError, token);
+    public void addLocalizedErrorFromToken(Sentence sentenceWithError, TokenElement token, Object... args) {
+        super.addLocalizedErrorFromToken(sentenceWithError, token, args);
     }
 
     @Override