feat: 🎸 Port the DB and models from the 2021 app (#43)
nikukyugamer authored Jun 17, 2022
1 parent 0d726c4 commit 5ed3678
Showing 34 changed files with 1,112 additions and 0 deletions.
358 changes: 358 additions & 0 deletions app/models/analyze_syntax.rb
@@ -0,0 +1,358 @@
require 'nkf'

class AnalyzeSyntax < ApplicationRecord
belongs_to :tweet, optional: true
belongs_to :direct_message, optional: true

def convert_analyze_syntax_response_sentence_objects
hashed_sentences.map do |hashed_sentence|
hashed_sentence.merge!(analyze_syntax_id: id)

# The keys of hashed_sentence become the attributes of the instance
AnalyzeSyntaxResponse::Sentence.new(hashed_sentence)
end
end

def convert_analyze_syntax_response_token_objects
hashed_tokens.map do |hashed_token|
hashed_token.merge!(analyze_syntax_id: id)

# The keys of hashed_token become the attributes of the instance
AnalyzeSyntaxResponse::Token.new(hashed_token)
end
end

def hashed_tokens
tokens.map { |token| JSON.parse(token) }
end

def hashed_sentences
sentences.map { |sentence| JSON.parse(sentence) }
end

# Comparison against the character-name list is by exact match, so the candidate words are kept as broad as possible
def check_words
# TODO: examples of words not yet detected:
# 「主人公」 (when it appears alone it needs to be judged together with the surrounding context and other words)
word_groups = [
words_with_noun_and_punct_and_noun_tags,
words_with_basic_filters,
words_with_noun_and_affix_tags,
words_with_num_and_affix_tags,
words_with_noun_and_noun_tags,
words_with_noun_and_x_tags,
words_with_affix_and_affix_tags,
]

# Every group is kept both as-is and with each normalization applied, since all variants are match candidates
word_groups.flat_map do |words|
words +
words.map { |word| remove_all_three_point_readers_from_word(word) } +
words.map { |word| convert_hankaku_katakana_to_zenkaku_katakana(word) } +
words.map { |word| convert_zenkaku_numbers_to_hankaku_numbers(word) } +
words.map { |word| remove_beginning_unnecesary_strings(word) }
end.uniq.reject(&:empty?)
end

private

###################################################################
# Extract spans whose tags appear in the order NOUN - PUNCT - NOUN
# Picks up names such as 「ヤム・クー」
###################################################################
def words_with_noun_and_punct_and_noun_tags
words_with_noun_and_punct_and_noun_tags = []
target_start_index_numbers = token_start_index_numbers_with_noun_and_punct_and_noun_tags

target_start_index_numbers.each do |index_number|
word = hashed_tokens[index_number]['lemma'] + hashed_tokens[index_number + 1]['lemma'] + hashed_tokens[index_number + 2]['lemma']

words_with_noun_and_punct_and_noun_tags << word
end

words_with_noun_and_punct_and_noun_tags
end

def token_start_index_numbers_with_noun_and_punct_and_noun_tags
target_tags = ['NOUN', 'PUNCT', 'NOUN'].freeze
tokens = convert_analyze_syntax_response_token_objects
tags_array = tokens.map(&:tag)
token_start_index_numbers = []

# To test three-element windows, scan start indexes below (array size - 2)
(tags_array.count - 2).times do |i|
target_array_in_tokens = [
tags_array[i],
tags_array[i + 1],
tags_array[i + 2],
]

token_start_index_numbers << i if target_array_in_tokens == target_tags
end

token_start_index_numbers
end

###################################################################
# Extract spans whose tags appear in the order NOUN - X
# Picks up names such as 「テンガアール」
###################################################################
def words_with_noun_and_x_tags
words_with_noun_and_x_tags = []
target_start_index_numbers = token_start_index_numbers_with_noun_and_x_tags

target_start_index_numbers.each do |index_number|
word = hashed_tokens[index_number]['lemma'] + hashed_tokens[index_number + 1]['lemma']

words_with_noun_and_x_tags << word
end

words_with_noun_and_x_tags
end

def token_start_index_numbers_with_noun_and_x_tags
target_tags = ['NOUN', 'X'].freeze
tokens = convert_analyze_syntax_response_token_objects
tags_array = tokens.map(&:tag)
token_start_index_numbers = []

# To test two-element windows, scan start indexes below (array size - 1)
(tags_array.count - 1).times do |i|
target_array_in_tokens = [
tags_array[i],
tags_array[i + 1],
]

token_start_index_numbers << i if target_array_in_tokens == target_tags
end

token_start_index_numbers
end

###################################################################
# Extract spans whose tags appear in the order NOUN - NOUN
# Picks up names such as 「ルカ様」
###################################################################
def words_with_noun_and_noun_tags
words_with_noun_and_noun_tags = []
target_start_index_numbers = token_start_index_numbers_with_noun_and_noun_tags

target_start_index_numbers.each do |index_number|
word = hashed_tokens[index_number]['lemma'] + hashed_tokens[index_number + 1]['lemma']

words_with_noun_and_noun_tags << word
end

words_with_noun_and_noun_tags
end

def token_start_index_numbers_with_noun_and_noun_tags
target_tags = ['NOUN', 'NOUN'].freeze
tokens = convert_analyze_syntax_response_token_objects
tags_array = tokens.map(&:tag)
token_start_index_numbers = []

# To test two-element windows, scan start indexes below (array size - 1)
(tags_array.count - 1).times do |i|
target_array_in_tokens = [
tags_array[i],
tags_array[i + 1],
]

token_start_index_numbers << i if target_array_in_tokens == target_tags
end

token_start_index_numbers
end

###################################################################
# Extract spans whose tags appear in the order NOUN - AFFIX
# Picks up names such as 「ルカ様」
###################################################################
def words_with_noun_and_affix_tags
words_with_noun_and_affix_tags = []
target_start_index_numbers = token_start_index_numbers_with_noun_and_affix_tags

target_start_index_numbers.each do |index_number|
word = hashed_tokens[index_number]['lemma'] + hashed_tokens[index_number + 1]['lemma']

words_with_noun_and_affix_tags << word
end

words_with_noun_and_affix_tags
end

def token_start_index_numbers_with_noun_and_affix_tags
target_tags = ['NOUN', 'AFFIX'].freeze
tokens = convert_analyze_syntax_response_token_objects
tags_array = tokens.map(&:tag)
token_start_index_numbers = []

# To test two-element windows, scan start indexes below (array size - 1)
(tags_array.count - 1).times do |i|
target_array_in_tokens = [
tags_array[i],
tags_array[i + 1],
]

token_start_index_numbers << i if target_array_in_tokens == target_tags
end

token_start_index_numbers
end

###################################################################
# Extract spans whose tags appear in the order NUM - AFFIX
# Picks up names such as 「4様」
###################################################################
def words_with_num_and_affix_tags
words_with_num_and_affix_tags = []
target_start_index_numbers = token_start_index_numbers_with_num_and_affix_tags

target_start_index_numbers.each do |index_number|
word = hashed_tokens[index_number]['lemma'] + hashed_tokens[index_number + 1]['lemma']

words_with_num_and_affix_tags << word
end

words_with_num_and_affix_tags
end

def token_start_index_numbers_with_num_and_affix_tags
target_tags = ['NUM', 'AFFIX'].freeze
tokens = convert_analyze_syntax_response_token_objects
tags_array = tokens.map(&:tag)
token_start_index_numbers = []

# To test two-element windows, scan start indexes below (array size - 1)
(tags_array.count - 1).times do |i|
target_array_in_tokens = [
tags_array[i],
tags_array[i + 1],
]

token_start_index_numbers << i if target_array_in_tokens == target_tags
end

token_start_index_numbers
end

###################################################################
# Extract spans whose tags appear in the order AFFIX - AFFIX
# Picks up e.g. 「坊ちゃん」 in certain contexts
###################################################################
def words_with_affix_and_affix_tags
words_with_affix_and_affix_tags = []
target_start_index_numbers = token_start_index_numbers_with_affix_and_affix_tags

target_start_index_numbers.each do |index_number|
word = hashed_tokens[index_number]['lemma'] + hashed_tokens[index_number + 1]['lemma']

words_with_affix_and_affix_tags << word
end

words_with_affix_and_affix_tags
end

def token_start_index_numbers_with_affix_and_affix_tags
target_tags = ['AFFIX', 'AFFIX'].freeze
tokens = convert_analyze_syntax_response_token_objects
tags_array = tokens.map(&:tag)
token_start_index_numbers = []

# To test two-element windows, scan start indexes below (array size - 1)
(tags_array.count - 1).times do |i|
target_array_in_tokens = [
tags_array[i],
tags_array[i + 1],
]

token_start_index_numbers << i if target_array_in_tokens == target_tags
end

token_start_index_numbers
end

###################################################################
# Tried to restrict this to the NOUN tag alone, but 「ベルクート」 was
# tagged VERB, so VERB is included as an exception
# 「シュウ」 was tagged AFFIX, so AFFIX is included as well
###################################################################
def words_with_basic_filters
filtered_tokens = convert_analyze_syntax_response_token_objects.select do |token|
token.tag == 'NOUN' || token.tag == 'VERB' || token.tag == 'AFFIX'
end

filtered_tokens.map(&:lemma)
end

###################################################################
# The methods below are generic helpers, so they would not have to be defined here
###################################################################

###################################################################
# Build a variant of the extracted words with the three-dot leader
# (三点リーダ) removed
# Needed because strings like 「…ルカ」 get recognized as a single NOUN
###################################################################
def remove_all_three_point_readers_from_word(word)
word.gsub(/…/, '')
end

###################################################################
# Convert hankaku (half-width) katakana in the extracted words to
# zenkaku (full-width)
# Needed because words are matched against character names by exact
# string comparison
###################################################################
def convert_hankaku_katakana_to_zenkaku_katakana(word)
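# A note on the NKF flags, per nkf's documented options: -W reads the
# input as UTF-8, -w writes UTF-8 output, -X enables conversion of
# JIS X 0201 (hankaku) katakana, and -m0 disables MIME decoding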
NKF.nkf('-WwXm0', word)
end

###################################################################
# Convert zenkaku (full-width) alphanumerics to hankaku (half-width)
# Normalizes e.g. 「２主」 to 「2主」
###################################################################
def convert_zenkaku_numbers_to_hankaku_numbers(word)
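# String#tr expands both arguments as character ranges, so each
# full-width character in ０-９ａ-ｚＡ-Ｚ is mapped to its half-width counterpart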
word.tr('０-９ａ-ｚＡ-Ｚ', '0-9a-zA-Z')
end

###################################################################
# Remove unnecessary characters from the beginning of an extracted element
# Normalizes e.g. 「:リオン」 to 「リオン」
# Normalizes e.g. 「★ナナミ」 to 「ナナミ」
###################################################################
def remove_beginning_unnecesary_strings(word)
# "2:" に対する対応 (id_number: 1396459824892710913)
removed_beginning_unnecesary_strings = word.sub(/\A2/, '')

# "★" に対する対応 (id_number: 1403442321144750081)
removed_beginning_unnecesary_strings = removed_beginning_unnecesary_strings.sub(/\A★/, '')

removed_beginning_unnecesary_strings.sub(/\A:/, '')
end

###################################################################
# Convert the elements of an array into a comma-separated string
# (each element double-quoted)
###################################################################
def convert_array_to_comma_separated_with_double_quote(array)
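# e.g. convert_array_to_comma_separated_with_double_quote(%w[ルカ ナナミ])
# => "\"ルカ\",\"ナナミ\""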
array.map { |element| "\"#{element}\"" }.join(",")
end
end
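A minimal sketch (not part of the commit) of how this model's extraction pipeline could be exercised. It assumes, as hashed_tokens implies, that the tokens and sentences columns hold arrays of JSON strings; the token data below is hypothetical.

require 'json'

# Hypothetical tokens shaped like the stored Natural Language API response
tokens = [
  { 'lemma' => 'ヤム', 'partOfSpeech' => { 'tag' => 'NOUN' } },
  { 'lemma' => '・', 'partOfSpeech' => { 'tag' => 'PUNCT' } },
  { 'lemma' => 'クー', 'partOfSpeech' => { 'tag' => 'NOUN' } },
].map(&:to_json)

analyze_syntax = AnalyzeSyntax.new(tokens: tokens, sentences: [])

# The NOUN - PUNCT - NOUN window yields the joined name, and the basic
# filter keeps the NOUN lemmas themselves
analyze_syntax.check_words
# => ["ヤム・クー", "ヤム", "クー"]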
15 changes: 15 additions & 0 deletions app/models/analyze_syntax_response/sentence.rb
@@ -0,0 +1,15 @@
module AnalyzeSyntaxResponse
class Sentence
include ActiveModel::Model

attr_accessor :text, :analyze_syntax_id

def begin_offset
text['beginOffset']
end

def content
text['content']
end
end
end
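A minimal usage sketch for this wrapper; the hash mirrors what AnalyzeSyntax#convert_analyze_syntax_response_sentence_objects passes in, and the values are hypothetical:

sentence = AnalyzeSyntaxResponse::Sentence.new(
  'text' => { 'beginOffset' => 0, 'content' => 'ルカ様が来た' },
  'analyze_syntax_id' => 1
)
sentence.begin_offset # => 0
sentence.content # => "ルカ様が来た"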
22 changes: 22 additions & 0 deletions app/models/analyze_syntax_response/token.rb
@@ -0,0 +1,22 @@
module AnalyzeSyntaxResponse
class Token
include ActiveModel::Model

# rubocop:disable Naming/MethodName, Layout/EmptyLinesAroundAttributeAccessor
attr_accessor :text, :partOfSpeech, :dependencyEdge, :lemma, :analyze_syntax_id
# rubocop:enable Naming/MethodName, Layout/EmptyLinesAroundAttributeAccessor

def tag
# In Google::Cloud::Language::V1::AnalyzeSyntaxResponse the return value is a Symbol, but here it is a String
part_of_speech['tag']
end

def part_of_speech
partOfSpeech
end

def dependency_edge
dependencyEdge
end
end
end
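A minimal usage sketch for the token wrapper; the camelCase keys mirror the JSON stored by AnalyzeSyntax, and the values are hypothetical:

token = AnalyzeSyntaxResponse::Token.new(
  'lemma' => 'ルカ',
  'partOfSpeech' => { 'tag' => 'NOUN' },
  'dependencyEdge' => { 'headTokenIndex' => 1, 'label' => 'NSUBJ' },
  'analyze_syntax_id' => 1
)
token.tag # => "NOUN"
token.lemma # => "ルカ"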