Skip to content

Commit

Permalink
Extract handling of Japanese in SpellingValidator as JapaneseExpressio…
Browse files Browse the repository at this point in the history
…nVariationValidator
  • Loading branch information
takahi-i committed Oct 4, 2017
1 parent bd48415 commit 5b89610
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 55 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/**
* redpen: a text inspection tool
* Copyright (c) 2014-2015 Recruit Technologies Co., Ltd. and contributors
* (see CONTRIBUTORS.md)
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cc.redpen.validator.sentence;

import cc.redpen.model.Sentence;
import cc.redpen.tokenizer.TokenElement;
import cc.redpen.validator.Validator;

import java.util.*;

import static java.util.Collections.singletonList;

/**
 * Reports Japanese expression variations: tokens that share the same reading
 * but are written with different surface forms.
 * <p>
 * {@link #preValidate(Sentence)} indexes every token by its reading, and
 * {@link #validate(Sentence)} then looks each token up in that index and reports
 * an error for every indexed token with the same reading whose surface differs.
 * Tokens that carry no reading tag are ignored by both passes.
 */
public class JapaneseExpressionVariationValidator extends Validator {
    /** Position of the reading inside a token's tag list, as read by this validator. */
    private static final int READING_TAG_INDEX = 7;

    /** Tokens collected by {@link #preValidate(Sentence)}, grouped by reading. */
    private final Map<String, List<TokenElement>> words = new HashMap<>();

    /**
     * Adds one error per indexed token that has the same reading as a token of
     * {@code sentence} but a different surface form.
     *
     * @param sentence sentence whose tokens are checked against the reading index
     */
    @Override
    public void validate(Sentence sentence) {
        for (TokenElement token : sentence.getTokens()) {
            String reading = readingOf(token);
            if (reading == null) {
                continue;
            }
            List<TokenElement> candidates = this.words.get(reading);
            if (candidates == null) {
                continue;
            }
            for (TokenElement candidate : candidates) {
                // Identity check skips the token itself, which preValidate stored in the index.
                if (candidate != token && !token.getSurface().equals(candidate.getSurface())) {
                    addLocalizedErrorFromToken(sentence, token);
                }
            }
        }
    }

    /**
     * Records every token of {@code sentence} in the reading index used by
     * {@link #validate(Sentence)}.
     *
     * @param sentence sentence whose tokens are added to the index
     */
    @Override
    public void preValidate(Sentence sentence) {
        for (TokenElement token : sentence.getTokens()) {
            String reading = readingOf(token);
            if (reading == null) {
                continue;
            }
            List<TokenElement> sameReading = this.words.get(reading);
            if (sameReading == null) {
                sameReading = new ArrayList<>();
                this.words.put(reading, sameReading);
            }
            sameReading.add(token);
        }
    }

    /**
     * @return a single-element list holding the Japanese language code
     */
    @Override
    public List<String> getSupportedLanguages() {
        return singletonList(Locale.JAPANESE.getLanguage());
    }

    /**
     * Returns the reading tag of {@code token}, or {@code null} when the tag list
     * is missing, too short to contain a reading, or holds {@code null} at that
     * position. Guarding here keeps such tokens from raising
     * {@link IndexOutOfBoundsException} or from being grouped under a shared
     * {@code null} key.
     * NOTE(review): presumably short tag lists occur for words unknown to the
     * tokenizer dictionary; confirm against the tokenizer in use.
     */
    private static String readingOf(TokenElement token) {
        List<String> tags = token.getTags();
        if (tags == null || tags.size() <= READING_TAG_INDEX) {
            return null;
        }
        return tags.get(READING_TAG_INDEX);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import cc.redpen.tokenizer.TokenElement;

import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

Expand All @@ -31,29 +30,6 @@ public final class SpellingValidator extends SpellingDictionaryValidator {

@Override
public void validate(Sentence sentence) {
String lang = getSymbolTable().getLang();
if (lang.equals("ja")) {
validate_ja(sentence);
} else if (lang.equals("en")) {
validate_en(sentence);
}
}

private void validate_ja(Sentence sentence) {
for (TokenElement token : sentence.getTokens()) {
String reading = token.getTags().get(7);
if (this.words.containsKey(reading)) {
List<TokenElement> tokens = this.words.get(reading);
for (TokenElement candidate : tokens) {
if (candidate != token && !token.getSurface().equals(candidate.getSurface())) {
addLocalizedErrorFromToken(sentence, token);
}
}
}
}
}

private void validate_en(Sentence sentence) {
for (TokenElement token : sentence.getTokens()) {
String surface = token.getSurface().toLowerCase();
if (surface.length() == 0 || surface.matches("\\P{L}+")) continue;
Expand All @@ -63,21 +39,4 @@ private void validate_en(Sentence sentence) {
}
}
}

@Override
public void preValidate(Sentence sentence) {
String lang = getSymbolTable().getLang();
if (!lang.equals("ja")) {
return;
}

for (TokenElement token : sentence.getTokens()) {
String reading = token.getTags().get(7);
if (!this.words.containsKey(reading)) {
this.words.put(reading, new LinkedList<TokenElement>());
}
this.words.get(reading).add(token);

}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,4 @@ JapaneseBrokenExpressionValidator=Found broken japanese expression (missing "Ra"
JapaneseJoyoKanjiValidator=Found non-joyo kanji: "{0}"
DoubledConjunctiveParticleGaValidator=Found multiple conjunctive particle: "{0}"
SuccessiveSentenceValidator=Found similar two sentences in succession: "{0}" and "{1}"
JapaneseExpressionVariationValidator=Found possible Japanese word variations: "{0}" and "{1}"
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,4 @@ JapaneseBrokenExpressionValidator=\u3089\u629C\u304D\u8868\u73FE\u3092\u4F7F\u75
JapaneseJoyoKanjiValidator=\u5E38\u7528\u6F22\u5B57\u3067\u306A\u3044\u6F22\u5B57\uFF08"{0}"\uFF09\u304C\u4F7F\u7528\u3055\u308C\u3066\u3044\u307E\u3059\u3002
DoubledConjunctiveParticleGaValidator=\u4E00\u6587\u306B\u9006\u8AAC\u306E\u63A5\u7D9A\u52A9\u8A5E "{0}" \u304C\u8907\u6570\u56DE\u4F7F\u7528\u3055\u308C\u3066\u3044\u307E\u3059\u3002
SuccessiveSentenceValidator=\u985E\u4F3C\u3059\u308B\u6587\u304C\u4E8C\u3064\u7D9A\u3051\u3066\u4F7F\u7528\u3055\u308C\u3066\u3044\u307E\u3059: "{0}"\u3001"{1}"
JapaneseExpressionVariationValidator=\u5358\u8A9E\u306E\u63FA\u3089\u304E\u306E\u53EF\u80FD\u6027\u304C\u3042\u308B\u8868\u73FE\u3001 \u201D{0}\u201D \u3001\u201D{1}\u201D\u304C\u898B\u3064\u304B\u308A\u307E\u3057\u305F\u3002
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/**
* redpen: a text inspection tool
* Copyright (c) 2014-2015 Recruit Technologies Co., Ltd. and contributors
* (see CONTRIBUTORS.md)
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cc.redpen.validator.sentence;

import cc.redpen.RedPen;
import cc.redpen.RedPenException;
import cc.redpen.config.Configuration;
import cc.redpen.config.ValidatorConfiguration;
import cc.redpen.model.Document;
import cc.redpen.validator.BaseValidatorTest;
import cc.redpen.validator.ValidationError;
import org.junit.jupiter.api.Test;

import java.util.List;
import java.util.Map;

import static java.util.Collections.singletonList;
import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for the validator registered under the name "JapaneseExpressionVariation".
 */
public class JapaneseExpressionVariationValidatorTest extends BaseValidatorTest {
    protected JapaneseExpressionVariationValidatorTest() {
        super("JapaneseExpressionVariation");
    }

    /**
     * Runs the validator over a two-sentence Japanese document and expects
     * exactly two errors to be reported for it.
     */
    @Test
    void japanese() throws RedPenException {
        config = Configuration.builder("ja")
                .addValidatorConfig(new ValidatorConfiguration(validatorName))
                .build();

        Document doc = prepareSimpleDocument("之は山です。これは川です。");
        List<Document> inputs = singletonList(doc);

        RedPen checker = new RedPen(config);
        Map<Document, List<ValidationError>> errorsByDocument = checker.validate(inputs);
        List<ValidationError> reported = errorsByDocument.get(doc);
        assertEquals(2, reported.size());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -107,20 +107,7 @@ void nonLatin() throws RedPenException {

RedPen redPen = new RedPen(config);
Map<Document, List<ValidationError>> errors = redPen.validate(singletonList(document));
assertEquals(0, errors.get(document).size());
}

@Test
void japanese() throws RedPenException {
config = Configuration.builder("ja")
.addValidatorConfig(new ValidatorConfiguration(validatorName))
.build();

Document document = prepareSimpleDocument("之は山です。これは川です。");

RedPen redPen = new RedPen(config);
Map<Document, List<ValidationError>> errors = redPen.validate(singletonList(document));
assertEquals(2, errors.get(document).size());
assertEquals(1, errors.get(document).size());
}

@Test
Expand Down

0 comments on commit 5b89610

Please sign in to comment.