Skip to content

Commit

Permalink
Support dictionaries (user dictionary and default dictionary)
Browse files Browse the repository at this point in the history
  • Loading branch information
takahi-i committed Oct 26, 2017
1 parent 29fe0ab commit 971bc92
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 8 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* redpen: a text inspection tool
* Copyright (c) 2014-2015 Recruit Technologies Co., Ltd. and contributors
* (see CONTRIBUTORS.md)
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cc.redpen.validator;

import cc.redpen.RedPenException;
import cc.redpen.util.DictionaryLoader;

import java.util.HashMap;
import java.util.Map;

import static java.util.Collections.emptyMap;
import static org.apache.commons.lang3.StringUtils.isNotEmpty;

/**
 * Validator base class that answers key-to-value lookups from dictionaries.
 *
 * <p>Two sources are consulted:
 * <ul>
 *   <li>a default dictionary, loaded in {@link #init()} only when a dictionary prefix was supplied
 *       through {@link #KeyValueDictionaryValidator(String)};</li>
 *   <li>the map stored under the {@code "map"} property, which {@link #init()} extends with the
 *       contents of the file named by the {@code "dict"} property when that property is not
 *       empty.</li>
 * </ul>
 * When both sources contain a key, {@link #getValue(String)} returns the entry from the
 * {@code "map"} property.
 */
public class KeyValueDictionaryValidator extends Validator {
    /** Loader used for both dictionaries; subclasses may replace it. */
    protected DictionaryLoader<Map<String, String>> loader = KEY_VALUE;
    /** Resource name prefix of the default dictionary, or {@code null} when there is none. */
    private String dictionaryPrefix;
    /** Default dictionary; stays empty until {@link #init()} loads one. */
    private Map<String, String> dictionary = emptyMap();

    /**
     * Creates a validator without a default dictionary, passing {@code "map"} (an empty map) and
     * {@code "dict"} (an empty string) to the superclass constructor.
     */
    public KeyValueDictionaryValidator() {
        super("map", new HashMap<>(), "dict", "");
    }

    /**
     * Creates a validator without a default dictionary and forwards the given arguments to
     * {@code addDefaultProperties}.
     *
     * @param keyValues arguments handed unchanged to {@code addDefaultProperties}
     */
    public KeyValueDictionaryValidator(Object... keyValues) {
        this();
        addDefaultProperties(keyValues);
    }

    /**
     * Creates a validator whose default dictionary is resolved from the given prefix.
     *
     * @param dictionaryPrefix prefix used by {@link #init()} to build the resource name
     *                         {@code default-resources/<prefix>-<lang>.dat}
     */
    public KeyValueDictionaryValidator(String dictionaryPrefix) {
        this();
        this.dictionaryPrefix = dictionaryPrefix;
    }

    /**
     * Loads the default dictionary (when a prefix was given) and then merges the user dictionary
     * file named by the {@code "dict"} property into the {@code "map"} property.
     *
     * @throws RedPenException declared by this method; presumably raised when a dictionary cannot
     *                         be loaded (the loader is not visible here)
     */
    @Override
    protected void init() throws RedPenException {
        final String ownerName = getClass().getSimpleName();

        if (dictionaryPrefix != null) {
            String resourcePath =
                    "default-resources/" + dictionaryPrefix + "-" + getSymbolTable().getLang() + ".dat";
            dictionary = loader.loadCachedFromResource(resourcePath, ownerName + " default dictionary");
        }

        String userDictionaryPath = getString("dict");
        if (!isNotEmpty(userDictionaryPath)) {
            // No user dictionary configured: nothing to merge.
            return;
        }
        Map<String, String> userEntries = getMap("map");
        userEntries.putAll(
                loader.loadCachedFromFile(findFile(userDictionaryPath), ownerName + " user dictionary"));
    }

    /**
     * Tells whether the word is a key of the default dictionary or of the {@code "map"} property.
     *
     * @param word key to look up
     * @return {@code true} when either source contains the key
     */
    protected boolean inDictionary(String word) {
        Map<String, String> userEntries = getMap("map");
        if (dictionary.containsKey(word)) {
            return true;
        }
        return userEntries != null && userEntries.containsKey(word);
    }

    /**
     * Returns the value registered for the word, preferring the {@code "map"} property over the
     * default dictionary.
     *
     * @param word key to look up
     * @return the mapped value, or {@code null} when neither source contains the key
     */
    protected String getValue(String word) {
        Map<String, String> userEntries = getMap("map");
        if (userEntries != null && userEntries.containsKey(word)) {
            return userEntries.get(word);
        }
        return dictionary.containsKey(word) ? dictionary.get(word) : null;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@
import cc.redpen.RedPenException;
import cc.redpen.model.*;
import cc.redpen.tokenizer.TokenElement;
import cc.redpen.validator.Validator;
import cc.redpen.validator.KeyValueDictionaryValidator;

import java.util.*;

import static java.util.Collections.singletonList;

public class JapaneseExpressionVariationValidator extends Validator {
public class JapaneseExpressionVariationValidator extends KeyValueDictionaryValidator {
private Map<Document, Map<String, List<TokenInfo>>> readingMap;
private Map<Document, List<Sentence>> sentenceMap;

Expand All @@ -39,6 +39,10 @@ public TokenInfo(TokenElement element, Sentence sentence) {
public Sentence sentence;
}

/**
 * Creates the validator, handing the dictionary prefix
 * {@code "japanese-spelling-variation/spelling-variation"} to the superclass constructor.
 */
public JapaneseExpressionVariationValidator() {
super("japanese-spelling-variation/spelling-variation");
}

@Override
public void validate(Document document) {
if (!sentenceMap.containsKey(document)) {
Expand All @@ -58,7 +62,6 @@ public void validate(Document document) {

private void generateErrors(Document document, Sentence sentence, TokenElement targetToken, String reading) {
Map<String, List<TokenInfo>> variationMap = generateVariationMap(document, targetToken, reading);

for (String surface : variationMap.keySet()) {
List<TokenInfo> variationList = variationMap.get(surface);
String variation = generateErrorMessage(variationList, surface);
Expand Down Expand Up @@ -137,8 +140,12 @@ private void extractTokensFromSentence(Document document, Sentence sentence) {
}

private String getReading(TokenElement token) {
String reading = token.getReading() != null ? token.getReading() : token.getSurface();
return reading.toLowerCase();
String surface = token.getSurface().toLowerCase();
if (inDictionary(surface)) {
return getValue(surface);
}
String reading = token.getReading() != null ? token.getReading() : surface;
return reading;
}

private List<Sentence> extractSentences(Document document) {
Expand All @@ -165,6 +172,7 @@ private List<Sentence> extractSentencesFromSection(Section section) {

/**
 * Runs the superclass initialization first, then replaces both per-document maps with fresh,
 * empty instances.
 *
 * @throws RedPenException propagated from the superclass initialization
 */
@Override
protected void init() throws RedPenException {
    super.init();
    // The two maps are independent of each other; both start out empty.
    sentenceMap = new HashMap<>();
    readingMap = new HashMap<>();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
node ノード
log ログ
cluster クラスタ
software ソフトウェア
index インデクス
database データベース
data データ
instance インスタンス
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ protected JapaneseExpressionVariationValidatorTest() {
}

@Test
void detecSameReadingsInJapaneseCharacters() throws RedPenException {
void detectSameReadingsInJapaneseCharacters() throws RedPenException {
config = Configuration.builder("ja")
.addValidatorConfig(new ValidatorConfiguration(validatorName))
.build();
.addValidatorConfig(new ValidatorConfiguration(validatorName))
.build();

Document document = prepareSimpleDocument("之は山です。これは川です。");

Expand All @@ -50,6 +50,32 @@ void detecSameReadingsInJapaneseCharacters() throws RedPenException {
assertEquals(1, errors.get(document).size());
}

/**
 * A document that uses both "node" and "ノード" is expected to produce exactly one error.
 */
@Test
void detectSameReadingsInJapaneseCharactersInDefaultDictionary() throws RedPenException {
    ValidatorConfiguration validatorConfig = new ValidatorConfiguration(validatorName);
    config = Configuration.builder("ja").addValidatorConfig(validatorConfig).build();

    Document document = prepareSimpleDocument("nodeは英語です。ノードはカタカナです。");

    RedPen redPen = new RedPen(config);
    List<ValidationError> documentErrors = redPen.validate(singletonList(document)).get(document);
    assertEquals(1, documentErrors.size());
}

/**
 * Same scenario as the lower-case case, but the alphabetic word is written "Node"; exactly one
 * error is still expected.
 */
@Test
void detectSameReadingsInJapaneseCharactersInDefaultDictionaryWithUpperCase() throws RedPenException {
    ValidatorConfiguration validatorConfig = new ValidatorConfiguration(validatorName);
    config = Configuration.builder("ja").addValidatorConfig(validatorConfig).build();

    Document document = prepareSimpleDocument("Nodeは英語です。ノードはカタカナです。");

    RedPen redPen = new RedPen(config);
    List<ValidationError> documentErrors = redPen.validate(singletonList(document)).get(document);
    assertEquals(1, documentErrors.size());
}

@Test
void detectSameAlphabecicalReadings() throws RedPenException {
config = Configuration.builder("ja")
Expand Down

0 comments on commit 971bc92

Please sign in to comment.