Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: japanese parenthesis #8381

Merged
merged 9 commits into from
May 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,23 @@ my $commas = qr/(?:\N{U+002C}|\N{U+FE50}|\N{U+FF0C}|\N{U+3001}|\N{U+FE51}|\N{U+F
my $stops = qr/(?:\N{U+002E}|\N{U+FE52}|\N{U+FF0E}|\N{U+3002}|\N{U+FE61})/i;

# '(' and other opening brackets ('Punctuation, Open' without QUOTEs)
# U+201A "‚" (Single Low-9 Quotation Mark)
benbenben2 marked this conversation as resolved.
Show resolved Hide resolved
# U+201E "„" (Double Low-9 Quotation Mark)
# U+276E "❮" (Heavy Left-Pointing Angle Quotation Mark Ornament)
# U+2E42 "⹂" (Double Low-Reversed-9 Quotation Mark)
# U+301D "〝" (Reversed Double Prime Quotation Mark)
# U+FF08 "(" (Fullwidth Left Parenthesis) used in some countries (Japan)
my $obrackets = qr/(?![\N{U+201A}|\N{U+201E}|\N{U+276E}|\N{U+2E42}|\N{U+301D}|\N{U+FF08}])[\p{Ps}]/i;

# ')' and other closing brackets ('Punctuation, Close' without QUOTEs)
# U+276F "❯" (Heavy Right-Pointing Angle Quotation Mark Ornament)
# U+301E "〞" (Double Prime Quotation Mark)
# U+301F "〟" (Low Double Prime Quotation Mark)
# U+FF09 ")" (Fullwidth Right Parenthesis) used in some countries (Japan)
my $cbrackets = qr/(?![\N{U+276F}|\N{U+301E}|\N{U+301F}|\N{U+FF09}])[\p{Pe}]/i;

my $separators_except_comma = qr/(;|:|$middle_dot|\[|\{|\(|( $dashes ))|(\/)/i
# U+FF0F "/" (Fullwidth Solidus) used in some countries (Japan)
my $separators_except_comma = qr/(;|:|$middle_dot|\[|\{|\(|\N{U+FF08}|( $dashes ))|(\/|\N{U+FF0F})/i
; # separators include the dot . followed by a space, but we don't want to separate 1.4 etc.

my $separators = qr/($stops\s|$commas|$separators_except_comma)/i;
Expand Down Expand Up @@ -1268,8 +1280,7 @@ sub parse_ingredients_text ($product_ref) {
my $processing = '';

$debug_ingredients and $log->debug("analyze_ingredients_function", {string => $s}) if $log->is_debug();

# find the first separator or ( or [ or :
# find the first separator or ( or [ or : etc.
if ($s =~ $separators) {

$before = $`;
Expand All @@ -1283,7 +1294,7 @@ sub parse_ingredients_text ($product_ref) {

# If the first separator is a colon : or the start of a parenthesis etc., we may have sub-ingredients

if ($sep =~ /(:|\[|\{|\()/i) {
if ($sep =~ /(:|\[|\{|\(|\N{U+FF08})/i) {

# Single separators like commas and dashes
my $match = '.*?'; # non greedy match
Expand All @@ -1305,6 +1316,10 @@ sub parse_ingredients_text ($product_ref) {
elsif ($sep eq '{') {
$ending = '\}';
}
# brackets type used in some countries (Japan) "(" and ")"
elsif ($sep =~ '\N{U+FF08}') {
$ending = '\N{U+FF09}';
}

$ending = '(' . $ending . ')';

Expand Down
242 changes: 242 additions & 0 deletions tests/unit/expected_test_results/ingredients/ja-parenthesis.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
{
"ingredients" : [
{
"id" : "en:soy-sauce",
"ingredients" : [
{
"id" : "ja:本醸造",
"percent_estimate" : 55,
"percent_max" : 100,
"percent_min" : 10,
"text" : "本醸造"
}
],
"percent_estimate" : 55,
"percent_max" : 100,
"percent_min" : 10,
"text" : "しょうゆ",
"vegan" : "ignore",
"vegetarian" : "ignore"
},
{
"id" : "ja:糖類",
"ingredients" : [
{
"id" : "ja:ぶどう糖果糖液糖",
"percent_estimate" : 11.25,
"percent_max" : 50,
"percent_min" : 0,
"text" : "ぶどう糖果糖液糖"
},
{
"id" : "ja:水あめ",
"percent_estimate" : 5.625,
"percent_max" : 25,
"percent_min" : 0,
"text" : "水あめ"
},
{
"id" : "en:sugar",
"percent_estimate" : 5.625,
"percent_max" : 16.6666666666667,
"percent_min" : 0,
"text" : "砂糖",
"vegan" : "yes",
"vegetarian" : "yes"
}
],
"percent_estimate" : 22.5,
"percent_max" : 50,
"percent_min" : 0,
"text" : "糖類"
},
{
"id" : "en:mirin",
"percent_estimate" : 11.25,
"percent_max" : 33.3333333333333,
"percent_min" : 0,
"text" : "みりん",
"vegan" : "ignore",
"vegetarian" : "ignore"
},
{
"id" : "en:salt",
"percent_estimate" : 5.625,
"percent_max" : 25,
"percent_min" : 0,
"text" : "食塩",
"vegan" : "yes",
"vegetarian" : "yes"
},
{
"id" : "ja:かつお節",
"percent_estimate" : 2.8125,
"percent_max" : 20,
"percent_min" : 0,
"text" : "かつお節"
},
{
"id" : "ja:さば節",
"percent_estimate" : 1.40625,
"percent_max" : 16.6666666666667,
"percent_min" : 0,
"text" : "さば節"
},
{
"id" : "ja:たん白加水分解物混合物",
"percent_estimate" : 0.703125,
"percent_max" : 14.2857142857143,
"percent_min" : 0,
"text" : "たん白加水分解物混合物"
},
{
"id" : "ja:こんぶ",
"percent_estimate" : 0.3515625,
"percent_max" : 12.5,
"percent_min" : 0,
"text" : "こんぶ"
},
{
"id" : "en:condiment",
"ingredients" : [
{
"id" : "ja:アミノ酸等",
"percent_estimate" : 0.17578125,
"percent_max" : 11.1111111111111,
"percent_min" : 0,
"text" : "アミノ酸等"
}
],
"percent_estimate" : 0.17578125,
"percent_max" : 11.1111111111111,
"percent_min" : 0,
"text" : "調味料",
"vegan" : "ignore",
"vegetarian" : "ignore"
},
{
"id" : "en:alcohol",
"percent_estimate" : 0.17578125,
"percent_max" : 10,
"percent_min" : 0,
"text" : "アルコール",
"vegan" : "maybe",
"vegetarian" : "yes"
}
],
"ingredients_analysis" : {
"en:palm-oil-content-unknown" : [
"ja:本醸造",
"ja:糖類",
"ja:ぶどう糖果糖液糖",
"ja:水あめ",
"ja:かつお節",
"ja:さば節",
"ja:たん白加水分解物混合物",
"ja:こんぶ",
"ja:アミノ酸等"
],
"en:vegan-status-unknown" : [
"ja:本醸造",
"ja:糖類",
"ja:ぶどう糖果糖液糖",
"ja:水あめ",
"ja:かつお節",
"ja:さば節",
"ja:たん白加水分解物混合物",
"ja:こんぶ",
"ja:アミノ酸等"
],
"en:vegetarian-status-unknown" : [
"ja:本醸造",
"ja:糖類",
"ja:ぶどう糖果糖液糖",
"ja:水あめ",
"ja:かつお節",
"ja:さば節",
"ja:たん白加水分解物混合物",
"ja:こんぶ",
"ja:アミノ酸等"
]
},
"ingredients_analysis_tags" : [
"en:palm-oil-content-unknown",
"en:vegan-status-unknown",
"en:vegetarian-status-unknown"
],
"ingredients_hierarchy" : [
"en:soy-sauce",
"en:sauce",
"ja:糖類",
"en:mirin",
"en:salt",
"ja:かつお節",
"ja:さば節",
"ja:たん白加水分解物混合物",
"ja:こんぶ",
"en:condiment",
"en:alcohol",
"ja:本醸造",
"ja:ぶどう糖果糖液糖",
"ja:水あめ",
"en:sugar",
"en:added-sugar",
"en:disaccharide",
"ja:アミノ酸等"
],
"ingredients_n" : 15,
"ingredients_n_tags" : [
"15",
"11-20"
],
"ingredients_original_tags" : [
"en:soy-sauce",
"ja:糖類",
"en:mirin",
"en:salt",
"ja:かつお節",
"ja:さば節",
"ja:たん白加水分解物混合物",
"ja:こんぶ",
"en:condiment",
"en:alcohol",
"ja:本醸造",
"ja:ぶどう糖果糖液糖",
"ja:水あめ",
"en:sugar",
"ja:アミノ酸等"
],
"ingredients_percent_analysis" : 1,
"ingredients_tags" : [
"en:soy-sauce",
"en:sauce",
"ja:糖類",
"en:mirin",
"en:salt",
"ja:かつお節",
"ja:さば節",
"ja:たん白加水分解物混合物",
"ja:こんぶ",
"en:condiment",
"en:alcohol",
"ja:本醸造",
"ja:ぶどう糖果糖液糖",
"ja:水あめ",
"en:sugar",
"en:added-sugar",
"en:disaccharide",
"ja:アミノ酸等"
],
"ingredients_text" : "しょうゆ(本醸造)、糖類(ぶどう糖果糖液糖、水あめ、砂糖)、みりん、食塩、かつお節、さば節、たん白加水分解物混合物、こんぶ、調味料(アミノ酸等)、アルコール",
"ingredients_with_specified_percent_n" : 0,
"ingredients_with_specified_percent_sum" : 0,
"ingredients_with_unspecified_percent_n" : 12,
"ingredients_with_unspecified_percent_sum" : 100,
"known_ingredients_n" : 9,
"lc" : "ja",
"nutriments" : {
"fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0,
"fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0
},
"unknown_ingredients_n" : 9
}
Loading