Skip to content

Commit

Permalink
Merge pull request #121 from true-runes/development
Browse files Browse the repository at this point in the history
v4.1.0
  • Loading branch information
nikukyugamer authored Jun 12, 2021
2 parents 8a754c8 + e5a4987 commit 628f41b
Show file tree
Hide file tree
Showing 3 changed files with 652 additions and 580 deletions.
9 changes: 9 additions & 0 deletions app/lib/suikoden_database/pickup_character_names.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,17 @@ def skip_word?(word)
end

# check_words の中身を連続で見ていった際に、これらの語の場合はキャラ名サーチをしない
# ここでは、「check_wordsに現れている語の中で除外する語」を指定する
# そもそも check_words に含まれていない語はここではどうしようもない
def skip_words
[
'票',
'/',
'm',
'(',
')',
'_',
'__',
'一',
'な',
'る',
Expand Down
69 changes: 66 additions & 3 deletions app/models/analyze_syntax.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,26 +38,37 @@ def check_words
words_with_noun_and_punct_and_noun_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_noun_and_punct_and_noun_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_noun_and_punct_and_noun_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_noun_and_punct_and_noun_tags.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_basic_filters +
words_with_basic_filters.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_basic_filters.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_basic_filters.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_basic_filters.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_noun_and_affix_tags +
words_with_noun_and_affix_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_noun_and_affix_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_noun_and_affix_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_noun_and_affix_tags.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_num_and_affix_tags +
words_with_num_and_affix_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_num_and_affix_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_num_and_affix_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_num_and_affix_tags.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_noun_and_noun_tags +
words_with_noun_and_noun_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_noun_and_noun_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_noun_and_noun_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_noun_and_noun_tags.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_noun_and_x_tags +
words_with_noun_and_x_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_noun_and_x_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_noun_and_x_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) }
words_with_noun_and_x_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_noun_and_x_tags.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_affix_and_affix_tags +
words_with_affix_and_affix_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_affix_and_affix_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_affix_and_affix_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_affix_and_affix_tags.map { |word| remove_beginning_unnecesary_strings(word) }
).uniq.reject(&:empty?)
end
# rubocop:enable Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity
Expand Down Expand Up @@ -246,11 +257,48 @@ def token_start_index_numbers_with_num_and_affix_tags
end

###################################################################
# AFFIX - AFFIX という並びのタグの部分を抽出する
# 特定の文脈における「坊ちゃん」などを抽出する
###################################################################
def words_with_affix_and_affix_tags
  # For every adjacent AFFIX-AFFIX token pair, join the two lemmas into a
  # single candidate word (e.g. "坊" + "ちゃん" -> "坊ちゃん") and return
  # all joined words as an array.
  token_start_index_numbers_with_affix_and_affix_tags.map do |index_number|
    hashed_tokens[index_number]['lemma'] + hashed_tokens[index_number + 1]['lemma']
  end
end

def token_start_index_numbers_with_affix_and_affix_tags
  # Return the index of every token that starts an adjacent AFFIX - AFFIX
  # pair, so the caller can join the two lemmas into one word.
  #
  # Replaces the previous `(count - 1).times.each` loop: `.times.each` is
  # redundant (`.times` already iterates), the per-call `.freeze` on a local
  # array had no effect, and a temporary 2-element array was allocated on
  # every iteration just for comparison. `each_cons(2)` walks each adjacent
  # pair directly and never treats the final token as the start of a pair,
  # which matches the old `count - 1` bound exactly.
  tags_array = convert_analyze_syntax_response_token_objects.map(&:tag)
  token_start_index_numbers = []

  tags_array.each_cons(2).with_index do |(current_tag, next_tag), i|
    token_start_index_numbers << i if current_tag == 'AFFIX' && next_tag == 'AFFIX'
  end

  token_start_index_numbers
end

###################################################################
# NOUN タグだけに絞ろうとしたが「ベルクート」が VERB だったので例外的に追加
# 「シュウ」が AFFIX だったので追加
###################################################################
def words_with_basic_filters
filtered_tokens = convert_analyze_syntax_response_token_objects.select do |token|
token.tag == 'NOUN' || token.tag == 'VERB'
token.tag == 'NOUN' || token.tag == 'VERB' || token.tag == 'AFFIX'
end

filtered_tokens.map(&:lemma)
Expand Down Expand Up @@ -284,6 +332,21 @@ def convert_zenkaku_numbers_to_hankaku_numbers(word)
word.tr('0-9a-zA-Z', '0-9a-zA-Z')
end

###################################################################
# Strip known junk characters from the head of an extracted word,
# normalizing variants such as ":リオン" -> "リオン" and
# "★ナナミ" -> "ナナミ".
###################################################################
def remove_beginning_unnecesary_strings(word)
  # Applied in this exact order, one leading character at a time:
  # "2" (id_number: 1396459824892710913), then "★"
  # (id_number: 1403442321144750081), then ":" — so "2:リオン"
  # becomes "リオン".
  word.sub(/\A2/, '').sub(/\A★/, '').sub(/\A:/, '')
end

###################################################################
# 配列の要素をカンマ区切り(ダブルクォート付)へ変換する
###################################################################
Expand Down
Loading

0 comments on commit 628f41b

Please sign in to comment.