Skip to content

Commit

Permalink
Fix build failures
Browse files (browse the repository at this point in the history)
  • Loading branch information
takahi-i committed Oct 15, 2017
1 parent 3aa65a9 commit f890b68
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,39 +17,33 @@
*/
package cc.redpen.tokenizer;

import com.atilika.kuromoji.unidic.Tokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseAnalyzer;
import org.codelibs.neologd.ipadic.lucene.analysis.ja.JapaneseTokenizer;
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.InflectionAttribute;
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.codelibs.neologd.ipadic.lucene.analysis.ja.tokenattributes.ReadingAttribute;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;

public class NeologdJapaneseTokenizer implements RedPenTokenizer {

private Tokenizer tokenizer;
private JapaneseAnalyzer analyzer;
private JapaneseTokenizer tokenizer;

public NeologdJapaneseTokenizer() {
this.tokenizer = new Tokenizer();
this.analyzer = new JapaneseAnalyzer(null, JapaneseTokenizer.DEFAULT_MODE, null, new HashSet<String>());
this.tokenizer = new JapaneseTokenizer(new StringReader(""), null, false, JapaneseTokenizer.Mode.NORMAL);
}

@Override
public List<TokenElement> tokenize(String content) {
List<TokenElement> tokens = new ArrayList<>();
try {
for (TokenElement token : kuromojineologd(content)) {
System.out.println(token);
tokens.add(token);
}
} catch (IOException e) {
Expand All @@ -59,26 +53,34 @@ public List<TokenElement> tokenize(String content) {
}

private List<TokenElement> kuromojineologd(String src) throws IOException {
tokenizer.setReader(new StringReader(src));
List<TokenElement> tokens = new ArrayList<>();
try (TokenStream tokenStream = analyzer.tokenStream("", new StringReader(src))) {
BaseFormAttribute baseAttr = tokenStream.addAttribute(BaseFormAttribute.class);
CharTermAttribute charAttr = tokenStream.addAttribute(CharTermAttribute.class);
PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
ReadingAttribute readAttr = tokenStream.addAttribute(ReadingAttribute.class);
OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);

tokenStream.reset();
int offset = 0;
while (tokenStream.incrementToken()) {
String surface = charAttr.toString();
tokens.add(new TokenElement(surface,
Arrays.asList(posAttr.getPartOfSpeech().split("-")),
offsetAttr.startOffset(),
readAttr.getReading()
));
offset += surface.length();
}
BaseFormAttribute baseAttr = tokenizer.addAttribute(BaseFormAttribute.class);
CharTermAttribute charAttr = tokenizer.addAttribute(CharTermAttribute.class);
PartOfSpeechAttribute posAttr = tokenizer.addAttribute(PartOfSpeechAttribute.class);
ReadingAttribute readAttr = tokenizer.addAttribute(ReadingAttribute.class);
OffsetAttribute offsetAttr = tokenizer.addAttribute(OffsetAttribute.class);
InflectionAttribute inflectionAttr = tokenizer.addAttribute(InflectionAttribute.class);
tokenizer.reset();
while (tokenizer.incrementToken()) {
String surface = charAttr.toString();
tokens.add(new TokenElement(surface,
getTagList(posAttr, inflectionAttr),
offsetAttr.startOffset(),
readAttr.getReading()
));
}
tokenizer.close();
return tokens;
}

/**
 * Assembles the tag list for a single token.
 *
 * <p>The list starts with the pieces of the part-of-speech string split on
 * {@code "-"}, followed by the inflection type and then the inflection form.
 * Whenever the inflection attribute reports {@code null} for the type or the
 * form, the placeholder {@code "*"} is stored in its place.
 *
 * @param posAttr        attribute supplying the hyphen-separated part-of-speech string
 * @param inflectionAttr attribute supplying the inflection type and form
 * @return a new mutable list: part-of-speech pieces, then type, then form
 */
private List<String> getTagList(PartOfSpeechAttribute posAttr, InflectionAttribute inflectionAttr) {
    String[] posPieces = posAttr.getPartOfSpeech().split("-");
    List<String> tags = new ArrayList<>(Arrays.asList(posPieces));

    String inflectionForm = inflectionAttr.getInflectionForm();
    if (inflectionForm == null) {
        inflectionForm = "*";
    }
    String inflectionType = inflectionAttr.getInflectionType();
    if (inflectionType == null) {
        inflectionType = "*";
    }

    // Order matters to callers indexing into the list: type precedes form.
    tags.add(inflectionType);
    tags.add(inflectionForm);
    return tags;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,21 +45,21 @@ public void validate(Sentence sentence) {
switch (stackSize) {
case 0:
if (tags.get(0).equals("名詞")) {
surfaces.add(tags.get(6));
surfaces.add(tokenElement.getSurface());
stackSize = 1;
}
case 1:
if (tags.get(0).equals("助詞") && tags.get(6).equals("の")) {
surfaces.add(tags.get(6));
if (tags.get(0).equals("助詞") && tokenElement.getSurface().equals("の")) {
surfaces.add(tokenElement.getSurface());
stackSize = 2;
}
break;
case 2:
if (tags.get(0).equals("名詞")) {
surfaces.add(tags.get(6));
surfaces.add(tokenElement.getSurface());
} else {
if (tags.get(0).equals("助詞") && tags.get(6).equals("の")) {
surfaces.add(tags.get(6));
if (tags.get(0).equals("助詞") && tokenElement.getSurface().equals("の")) {
surfaces.add(tokenElement.getSurface());
stackSize = 3;
} else {
surfaces.clear();
Expand All @@ -69,7 +69,7 @@ public void validate(Sentence sentence) {
break;
case 3:
if (tags.get(0).equals("名詞")) {
surfaces.add(tags.get(6));
surfaces.add(tokenElement.getSurface());
} else {
String surface = String.join("", surfaces);
if (!inDictionary(surface)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,10 @@
import cc.redpen.model.Sentence;
import cc.redpen.tokenizer.TokenElement;
import cc.redpen.validator.Validator;
import cc.redpen.util.Pair;

import java.util.*;
import java.util.regex.Pattern;
import java.util.List;
import java.util.Locale;

import static java.util.Collections.emptySet;
import static java.util.Collections.singletonList;

/**
Expand All @@ -41,10 +39,10 @@ public void validate(Sentence sentence) {
for (int i = 0; i < (tokens.size() - 1); ++i) {
final TokenElement p = tokens.get(i);
final List<String> ptags = p.getTags();
if (ptags.get(0).equals("動詞") && ptags.get(1).equals("自立") && ptags.get(4).equals("一段") && ptags.get(5).equals("未然形")) {
if (ptags.get(0).equals("動詞") && ptags.get(1).equals("自立") && ptags.get(2).equals("一段") && ptags.get(3).equals("未然形")) {
final TokenElement q = tokens.get(i+1);
final List<String> qtags = q.getTags();
if (qtags.get(0).equals("動詞") && qtags.get(1).equals("接尾") && qtags.get(4).equals("一段") && qtags.get(5).equals("未然形") && qtags.get(6).equals("られる")) {
if (qtags.get(0).equals("動詞") && qtags.get(1).equals("接尾") && qtags.get(2).equals("一段") && qtags.get(3).equals("未然形") && q.getSurface().equals("られ")) {
} else {
addLocalizedError(sentence, p.getSurface());
}
Expand Down

0 comments on commit f890b68

Please sign in to comment.