openfoodfacts · stephanegigandet · Aug 17, 2023 · Aug 16, 2023 · Aug 16, 2023 · Aug 16, 2023
@@ -161,6 +161,10 @@ BEGIN {
 
 		&cmp_taxonomy_tags_alphabetically
 
+		&cached_display_taxonomy_tag
+		$cached_display_taxonomy_tag_calls
+		$cached_display_taxonomy_tag_misses
+
 	);    # symbols to export on request
 	%EXPORT_TAGS = (all => [@EXPORT_OK]);
 }
@@ -3851,6 +3855,43 @@ sub exists_taxonomy_tag ($tagtype, $tagid) {
 			and not((exists $just_synonyms{$tagtype}) and (exists $just_synonyms{$tagtype}{$tagid})));
 }
 
+=head2 cached_display_taxonomy_tag ( $target_lc, $tagtype, $canon_tagid )
+
+Return the name of a tag for displaying it to the user.
+This function builds a cache of the resulting names, in order to reduce execution time.
+The cache is an ever-growing hash keyed by the input parameters: entries are never removed from it.
+For this reason, this function should only be used in batch scripts, and not in code called from the Apache mod_perl processes.
+
+=head3 Arguments
+
+=head4 $target_lc - target language code
+
+=head4 $tagtype
+
+=head4 $canon_tagid
+
+=head3 Return values
+
+The tag translation if it exists in target language,
+otherwise, the tag id.
+
+=cut
+
+my %cached_display_taxonomy_tags = ();    # cache filled by cached_display_taxonomy_tag(): "lc:tagtype:tag" key => displayed name
+$cached_display_taxonomy_tag_calls = 0;    # total number of calls to cached_display_taxonomy_tag()
+$cached_display_taxonomy_tag_misses = 0;    # number of those calls that were not answered from the cache
+
+sub cached_display_taxonomy_tag ($target_lc, $tagtype, $tag) {
+	# Every call is counted; only calls that have to compute the name count as misses
+	$cached_display_taxonomy_tag_calls++;
+	my $cache_key = join(':', $target_lc, $tagtype, $tag);
+	if (not exists $cached_display_taxonomy_tags{$cache_key}) {
+		$cached_display_taxonomy_tag_misses++;
+		$cached_display_taxonomy_tags{$cache_key} = display_taxonomy_tag($target_lc, $tagtype, $tag);
+	}
+	return $cached_display_taxonomy_tags{$cache_key};
+}
+
 =head2 display_taxonomy_tag ( $target_lc, $tagtype, $canon_tagid )
 
 Return the name of a tag for displaying it to the user

@@ -124,6 +124,7 @@ sub sanitize_field_content {
 	}
 }
 
+$fields_ref->{empty} = 1;
 $fields_ref->{nutriments} = 1;
 $fields_ref->{ingredients} = 1;
 $fields_ref->{images} = 1;
@@ -280,17 +281,19 @@ sub sanitize_field_content {
 
 		# 300 000 ms timeout so that we can export the whole database
 		# 5mins is not enough, 50k docs were exported
-		my $cursor = $collection->query(
-			{
-				'code' => {"\$ne" => ""},
-				'empty' => {"\$ne" => 1}
-			}
-		)->fields($fields_ref)->sort({code => 1});
+		# Removed sort({code => 1}) in order to speed up the MongoDB query and not run into the error
+		# "MongoDB::DatabaseError: Executor error during find command :: caused by :: Sort exceeded memory limit of 104857600 bytes, but did not opt in to external sorting."
+		my $cursor = $collection->query()->fields($fields_ref);
 
 		$cursor->immortal(1);
 
 		while (my $product_ref = $cursor->next) {
 
+			# Skip empty products and products without code
+			# We filter them here instead of in the query
+			next if not $product_ref->{code};
+			next if $product_ref->{empty};
+
 			my $csv = '';
 			my $url = "http://world-$lc.$server_domain" . product_url($product_ref);
 			my $code = ($product_ref->{code} // '');
@@ -372,9 +375,8 @@ sub sanitize_field_content {
 				}
 				if (defined $taxonomy_fields{$field}) {
 					if (defined $product_ref->{$field . '_tags'}) {
-						$csv
-							.= join(',',
-							map {display_taxonomy_tag($lc, $field, $_)} @{$product_ref->{$field . '_tags'}})
+						$csv .= join(',',
+							map {cached_display_taxonomy_tag($lc, $field, $_)} @{$product_ref->{$field . '_tags'}})
 							. "\t";
 					}
 					else {
@@ -412,7 +414,7 @@ sub sanitize_field_content {
 				$main_cid = $product_ref->{categories_tags}[(scalar @{$product_ref->{categories_tags}}) - 1];
 
 				$main_cid = canonicalize_tag2("categories", $main_cid);
-				$main_cid_lc = display_taxonomy_tag($lc, 'categories', $main_cid);
+				$main_cid_lc = cached_display_taxonomy_tag($lc, 'categories', $main_cid);
 			}
 
 			$csv .= $main_cid . "\t";
@@ -518,14 +520,18 @@ sub sanitize_field_content {
 
 	# only overwrite previous dump if the new one is bigger, to reduce failed runs breaking the dump.
 	my $csv_size_old = (-s $csv_filename) // 0;
-	my $csv_size_new = (-s "$csv_filename.temp") // 0;
+	# Sort lines by code, except header line
+	system("(head -1 $csv_filename.temp && (tail -n +2 $csv_filename.temp | sort)) > $csv_filename.temp2");
+	unlink "$csv_filename.temp";
+	my $csv_size_new = (-s "$csv_filename.temp2") // 0;
+	# guard: we replace target file only if it's big enough (to avoid replacing valid export by a broken one)
 	if ($csv_size_new >= $csv_size_old * 0.99) {
 		unlink $csv_filename;
-		rename "$csv_filename.temp", $csv_filename;
+		rename "$csv_filename.temp2", $csv_filename;
 	}
 	else {
 		print STDERR "Not overwriting previous CSV. Old size = $csv_size_old, new size = $csv_size_new.\n";
-		unlink "$csv_filename.temp";
+		unlink "$csv_filename.temp2";
 	}
 
 	my %links = ();