-
Notifications
You must be signed in to change notification settings - Fork 6
/
ingredient_jac_sim.py
163 lines (123 loc) · 4.69 KB
/
ingredient_jac_sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import numpy as np
import json
import pprint as pp
from anytree import RenderTree
from zss import *
from retrieve_recipes import *
from im2recipe_w2v_weight import *
# Recipes for the 'chocochip' query. get_recipes_json is not defined in this file
# (presumably it comes from one of the star imports above -- confirm). Each recipe is
# later indexed with the keys 'ingredients' and 'origin_id'.
recipes = get_recipes_json('chocochip')  # list
def print_tree(tree):
    """Print one line per rendered node: the render prefix followed by the node label."""
    for prefix, _fill, node in RenderTree(tree):
        line = "%s%s" % (prefix, node.label)
        print(line)
# Pairwise Jaccard similarity between the recipes' ingredient lists
from operator import add
import re
import json
import nltk
from NYTtagger.lib.training import utils
from string import punctuation
import pycrfsuite
# Module-level tagger shared by parse_ingredient(); it opens the trained model at a
# relative path, so the file is looked up relative to the current working directory.
tagger = pycrfsuite.Tagger()
tagger.open('NYTtagger/tmp/trained_pycrfsuite')
from nltk.tokenize import PunktSentenceTokenizer
# Module-level sentence tokenizer used by parse_recipe_ingredients() to split the
# joined ingredient text back into individual sentences.
tokenizer = PunktSentenceTokenizer()
def sent2labels(sent):
    """Return the last element of every entry in ``sent``, in order."""
    labels = []
    for entry in sent:
        labels.append(entry[-1])
    return labels
def sent2features(sent):
    """Return every entry of ``sent`` with its last element removed, in order."""
    return list(map(lambda entry: entry[:-1], sent))
def sent2tokens(sent):
    """Return the first element of every entry in ``sent``, in order."""
    tokens = []
    for entry in sent:
        first = entry[0]
        tokens.append(first)
    return tokens
def get_sentence_features(sent):
    """Build one feature row per token of ``sent``.

    The sentence goes through ``utils.cleanUnicodeFractions`` and then
    ``utils.tokenize``. Each returned row is a list that starts with the token
    and continues with whatever ``utils.getFeatures`` yields for it.
    """
    tokens = list(utils.tokenize(utils.cleanUnicodeFractions(sent)))
    rows = []
    # Positions handed to getFeatures are 1-based, and every call receives its
    # own copy of the token list.
    for position, token in enumerate(tokens, start=1):
        extra = utils.getFeatures(token, position, list(tokens))
        rows.append([token] + list(extra))
    return rows
def format_ingredient_output(tagger_output, display=False):
    """Group tagged tokens of one ingredient phrase into a dictionary.

    Args:
        tagger_output: Iterable of ``(token, tag)`` pairs. It is consumed in a
            single pass, so a one-shot iterator such as ``zip(...)`` is fine.
        display: Unused. Kept only so the signature stays compatible; the
            previous implementation overwrote it with a local of the same name.

    Returns:
        An empty list when ``tagger_output`` yields no pairs. Otherwise a
        one-element list whose dict maps every tag to ``utils.smartJoin`` of
        the tokens carrying that tag, plus an ``"input"`` key holding the
        phrase rebuilt from the consecutive same-tag runs.
    """
    tokens_by_tag = {}
    # Consecutive tokens sharing a tag, kept in original order so the raw
    # phrase can be rebuilt afterwards: [(tag, [token, ...]), ...]
    tag_runs = []
    prev_tag = None

    for token, tag in tagger_output:
        # Tags are used verbatim as dictionary keys (the B-/I- prefix is not
        # stripped); code elsewhere in this file reads keys such as "B-NAME".
        if prev_tag != tag:
            tag_runs.append((tag, [token]))
            prev_tag = tag
        else:
            tag_runs[-1][1].append(token)

        # HACK: If this token is a unit, singularize it so Scoop accepts it.
        # NOTE(review): with prefixed tags this comparison presumably never
        # matches -- confirm which tag strings the tagger actually emits.
        if tag == "unit":
            token = utils.singularize(token)

        tokens_by_tag.setdefault(tag, []).append(token)

    if not tokens_by_tag:
        return []

    ingredient = {
        tag: utils.smartJoin(tokens) for tag, tokens in tokens_by_tag.items()
    }
    # Add the raw ingredient phrase.
    ingredient["input"] = utils.smartJoin(
        [" ".join(tokens) for _tag, tokens in tag_runs])
    return [ingredient]
def parse_ingredient(sent):
    """Tag a single ingredient sentence and return the formatted result.

    The sentence is turned into features, tagged with the module-level
    ``tagger``, and passed to ``format_ingredient_output``. When that result is
    non-empty, its first entry always gets a ``'name'`` key: the existing value
    (or ``''`` when absent) with leading and trailing ``'.'`` removed.
    """
    features = get_sentence_features(sent)
    predicted_tags = tagger.tag(features)
    token_tag_pairs = zip(sent2tokens(features), predicted_tags)
    result = format_ingredient_output(token_tag_pairs)
    if not result:
        return result
    first = result[0]
    first['name'] = first.get('name', '').strip('.')
    return result
def parse_recipe_ingredients(ingredient_list):
    """Parse every sentence of ``ingredient_list`` and return all results in one list.

    ``ingredient_list`` is split with the module-level ``tokenizer``; each
    sentence has leading and trailing newlines stripped before it is handed to
    ``parse_ingredient``.
    """
    parsed = []
    for raw_sentence in tokenizer.tokenize(ingredient_list):
        parsed.extend(parse_ingredient(raw_sentence.strip('\n')))
    return parsed
# For every recipe, collect the name strings of its parsed ingredients.
ingre_lists = []
for recipe in recipes:
    # The raw ingredient lines are joined with ". " into one text, which
    # parse_recipe_ingredients splits into sentences again.
    joined_ingredients = ". ".join(recipe['ingredients'])
    names = []
    for parsed in parse_recipe_ingredients(joined_ingredients):
        # Missing name parts default to the empty string (and are stored on the dict).
        inside_name = parsed.setdefault("I-NAME", "")
        begin_name = parsed.setdefault("B-NAME", "")
        names.append(begin_name + " " + inside_name)
    ingre_lists.append({"id": recipe['origin_id'], "ingredients": names})

# Square matrix with one row and one column per recipe, initialised to zero.
dim = len(ingre_lists)
ingre_dist_matrix = np.zeros((dim, dim))
from math import*
def jaccard_similarity(x, y):
    """Return the Jaccard similarity of two collections.

    The similarity is the size of the intersection divided by the size of the
    union of the distinct items in ``x`` and ``y``.

    Args:
        x: Iterable of hashable items; duplicates are ignored.
        y: Iterable of hashable items; duplicates are ignored.

    Returns:
        A float between 0.0 and 1.0. Two empty inputs are treated as identical
        and give 1.0 (previously this raised ZeroDivisionError).
    """
    # Build each set exactly once so one-shot iterators are not exhausted
    # before the union is computed.
    left, right = set(x), set(y)
    union_cardinality = len(left | right)
    if union_cardinality == 0:
        return 1.0
    return len(left & right) / float(union_cardinality)
# Fill the strict upper triangle (column > row) with pairwise similarities;
# the diagonal and the lower triangle keep their initial value of 0.
for row, left in enumerate(ingre_lists):
    for col in range(row + 1, dim):
        ingre_dist_matrix[row, col] = jaccard_similarity(
            left['ingredients'], ingre_lists[col]['ingredients'])

# Persist the result as a pickled numpy matrix.
mat = np.matrix(ingre_dist_matrix)
mat.dump("ingre_dist_matrix.dat")
# To read it back: np.load("ingre_dist_matrix.dat", allow_pickle=True)