Skip to content

Commit

Permalink
Merge pull request #121 from true-runes/development
Browse files Browse the repository at this point in the history
v4.1.0
  • Loading branch information
nikukyugamer authored Jun 12, 2021
2 parents 8a754c8 + e5a4987 commit 628f41b
Show file tree
Hide file tree
Showing 3 changed files with 652 additions and 580 deletions.
9 changes: 9 additions & 0 deletions app/lib/suikoden_database/pickup_character_names.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,17 @@ def skip_word?(word)
end

# check_words の中身を連続で見ていった際に、これらの語の場合はキャラ名サーチをしない
# ここでは、「check_wordsに現れている語の中で除外する語」を指定する
# そもそも check_words に含まれていない語はここではどうしようもない
def skip_words
[
'票',
'/',
'm',
'(',
')',
'_',
'__',
'一',
'な',
'る',
Expand Down
69 changes: 66 additions & 3 deletions app/models/analyze_syntax.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,26 +38,37 @@ def check_words
words_with_noun_and_punct_and_noun_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_noun_and_punct_and_noun_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_noun_and_punct_and_noun_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_noun_and_punct_and_noun_tags.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_basic_filters +
words_with_basic_filters.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_basic_filters.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_basic_filters.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_basic_filters.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_noun_and_affix_tags +
words_with_noun_and_affix_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_noun_and_affix_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_noun_and_affix_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_noun_and_affix_tags.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_num_and_affix_tags +
words_with_num_and_affix_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_num_and_affix_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_num_and_affix_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_num_and_affix_tags.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_noun_and_noun_tags +
words_with_noun_and_noun_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_noun_and_noun_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_noun_and_noun_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_noun_and_noun_tags.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_noun_and_x_tags +
words_with_noun_and_x_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_noun_and_x_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_noun_and_x_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) }
words_with_noun_and_x_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_noun_and_x_tags.map { |word| remove_beginning_unnecesary_strings(word) } +
words_with_affix_and_affix_tags +
words_with_affix_and_affix_tags.map { |word| remove_all_three_point_readers_from_word(word) } +
words_with_affix_and_affix_tags.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words_with_affix_and_affix_tags.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words_with_affix_and_affix_tags.map { |word| remove_beginning_unnecesary_strings(word) }
).uniq.reject(&:empty?)
end
# rubocop:enable Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity
Expand Down Expand Up @@ -246,11 +257,48 @@ def token_start_index_numbers_with_num_and_affix_tags
end

###################################################################
# AFFIX - AFFIX という並びのタグの部分を抽出する
# 特定の文脈における「坊ちゃん」などを抽出する
###################################################################
def words_with_affix_and_affix_tags
  # For every adjacent AFFIX-AFFIX token pair, join the two lemmas into a
  # single candidate word (e.g. "坊" + "ちゃん" -> "坊ちゃん") and return
  # all joined words as an array.
  token_start_index_numbers_with_affix_and_affix_tags.map do |index_number|
    hashed_tokens[index_number]['lemma'] + hashed_tokens[index_number + 1]['lemma']
  end
end

def token_start_index_numbers_with_affix_and_affix_tags
  # Return the index of every token that starts an adjacent AFFIX - AFFIX
  # pair, so the caller can join the two lemmas into one word.
  #
  # Replaces the previous `(count - 1).times.each` loop: `.times.each` is
  # redundant (`.times` already iterates), the per-call `.freeze` on a local
  # array had no effect, and a temporary 2-element array was allocated on
  # every iteration just for comparison. `each_cons(2)` walks each adjacent
  # pair directly and never treats the final token as the start of a pair,
  # which matches the old `count - 1` bound exactly.
  tags_array = convert_analyze_syntax_response_token_objects.map(&:tag)
  token_start_index_numbers = []

  tags_array.each_cons(2).with_index do |(current_tag, next_tag), i|
    token_start_index_numbers << i if current_tag == 'AFFIX' && next_tag == 'AFFIX'
  end

  token_start_index_numbers
end

###################################################################
# NOUN タグだけに絞ろうとしたが「ベルクート」が VERB だったので例外的に追加
# 「シュウ」が AFFIX だったので追加
###################################################################
def words_with_basic_filters
filtered_tokens = convert_analyze_syntax_response_token_objects.select do |token|
token.tag == 'NOUN' || token.tag == 'VERB'
token.tag == 'NOUN' || token.tag == 'VERB' || token.tag == 'AFFIX'
end

filtered_tokens.map(&:lemma)
Expand Down Expand Up @@ -284,6 +332,21 @@ def convert_zenkaku_numbers_to_hankaku_numbers(word)
word.tr('0-9a-zA-Z', '0-9a-zA-Z')
end

###################################################################
# Strip known junk characters from the head of an extracted word,
# normalizing variants such as ":リオン" -> "リオン" and
# "★ナナミ" -> "ナナミ".
###################################################################
def remove_beginning_unnecesary_strings(word)
  # Applied in this exact order, one leading character at a time:
  # "2" (id_number: 1396459824892710913), then "★"
  # (id_number: 1403442321144750081), then ":" — so "2:リオン"
  # becomes "リオン".
  word.sub(/\A2/, '').sub(/\A★/, '').sub(/\A:/, '')
end

###################################################################
# 配列の要素をカンマ区切り(ダブルクォート付)へ変換する
###################################################################
Expand Down
Loading

0 comments on commit 628f41b

Please sign in to comment.