feat: add batch evaluation method for pipelines #2942
Merged
Commits
All 29 commits are by julian-risch:

9d1f2dd  add basic pipeline.eval_batch for qa without filters
4a398a3  Merge branch 'master' into batch-eval
59b64c3  black formatting
789cc2d  pydoc-markdown
9a6c681  remove batch eval tests failing due to bugs
671826a  remove comment
ad40a55  explain commented out tests
1bda44b  avoid code duplication
7bacd93  black
8cb7a51  mypy
5cf8f0a  pydoc markdown
b3b57f5  add batch option to execute_eval_run
9aad547  pydoc markdown
313a5e8  Merge branch 'master' into batch-eval
2738994  Apply documentation suggestions from code review
d184d20  Apply documentation suggestion from code review
07682eb  add documentation based on review comments
a2d4d6f  Merge branch 'batch-eval' of github.com:deepset-ai/haystack into batc…
195f8a1  black
17f750d  black
16076ea  schema updates
7339b45  remove duplicate tests
0fa1bcb  add separate method for column reordering
a1ac6b4  merge _build_eval_dataframe methods
afd03a5  pylint ignore in function
4b0b242  change type annotation of queries to list only
ab4dccb  one-liner addressing review comment on params dict
af434ee  black
16cf698  markdown files updated
@@ -217,6 +217,65 @@ def eval(

The hunk adds a new `eval_batch()` method to the standard pipeline wrapper, directly after `eval()` and before `print_eval_report()`:

def eval_batch(
    self,
    labels: List[MultiLabel],
    params: Optional[dict] = None,
    sas_model_name_or_path: Optional[str] = None,
    sas_batch_size: int = 32,
    sas_use_gpu: bool = True,
    add_isolated_node_eval: bool = False,
    custom_document_id_field: Optional[str] = None,
    context_matching_min_length: int = 100,
    context_matching_boost_split_overlaps: bool = True,
    context_matching_threshold: float = 65.0,
) -> EvaluationResult:
    """
    Evaluates the pipeline by running it once per query in debug mode
    and putting together all data that is needed for evaluation, for example, calculating metrics.

    To calculate SAS (Semantic Answer Similarity) metrics, specify `sas_model_name_or_path`.

    You can control the scope within which an Answer or a Document is considered correct afterwards
    (see the `document_scope` and `answer_scope` params in `EvaluationResult.calculate_metrics()`).
    For some of these scopes, you need to add the following information during `eval()`:
    - `custom_document_id_field` parameter to select a custom document ID from the document's metadata
      for ID matching (only affects 'document_id' scopes).
    - `context_matching_...` parameters to fine-tune the fuzzy matching mechanism that determines
      whether text contexts match each other (only affects 'context' scopes, default values should
      work most of the time).

    :param labels: The labels to evaluate on.
    :param params: Parameters for the `retriever` and `reader`. For instance,
                   params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}.
    :param sas_model_name_or_path: Sentence transformers semantic textual similarity model you want to use
                                   for the SAS value calculation. It should be a path or a string pointing
                                   to downloadable models.
    :param sas_batch_size: Number of prediction label pairs to encode at once by cross encoder or
                           sentence transformer while calculating SAS.
    :param sas_use_gpu: Whether to use a GPU or the CPU for calculating semantic answer similarity.
                        Falls back to CPU if no GPU is available.
    :param add_isolated_node_eval: Whether to additionally evaluate the reader based on labels as input,
                                   instead of the output of the previous node in the pipeline.
    :param custom_document_id_field: Custom field name within `Document`'s `meta` which identifies the
                                     document and is used as a criterion for matching documents to labels
                                     during evaluation. This is especially useful if you want to match
                                     documents on other criteria (for example, file names) than the default
                                     document IDs, as these could be heavily influenced by preprocessing.
                                     If not set, the default `Document`'s `id` is used as the criterion
                                     for matching documents to labels.
    :param context_matching_min_length: The minimum string length context and candidate need to have
                                        to be scored. Returns 0.0 otherwise.
    :param context_matching_boost_split_overlaps: Whether to boost split overlaps (for example, [AB] <-> [BC])
                                                  that result from different preprocessing parameters.
                                                  If we detect that the score is near a half match and the
                                                  matching part of the candidate is at its boundaries,
                                                  we cut the context on the same side, recalculate the score,
                                                  and take the mean of both. Thus [AB] <-> [BC] (score ~50)
                                                  gets recalculated with B <-> B (score ~100), scoring ~75 in total.
    :param context_matching_threshold: Score threshold that candidates must surpass to be included into
                                       the result list. Range: [0,100]
    """
    output = self.pipeline.eval_batch(
        labels=labels,
        params=params,
        sas_model_name_or_path=sas_model_name_or_path,
        sas_batch_size=sas_batch_size,
        sas_use_gpu=sas_use_gpu,
        add_isolated_node_eval=add_isolated_node_eval,
        custom_document_id_field=custom_document_id_field,
        context_matching_boost_split_overlaps=context_matching_boost_split_overlaps,
        context_matching_min_length=context_matching_min_length,
        context_matching_threshold=context_matching_threshold,
    )
    return output
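For orientation (not part of the diff), here is a minimal usage sketch of the new method. It assumes `pipeline` is an already-built standard QA pipeline (for example, `ExtractiveQAPipeline`) and `eval_labels` is a prepared `List[MultiLabel]`; the SAS model name and the metric keys are illustrative assumptions rather than something this PR prescribes.

```python
from haystack.schema import EvaluationResult

# Assumed to exist already: `pipeline` (e.g. an ExtractiveQAPipeline) and
# `eval_labels` (a List[MultiLabel] built from annotated data).
eval_result: EvaluationResult = pipeline.eval_batch(
    labels=eval_labels,
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
    sas_model_name_or_path="cross-encoder/stsb-roberta-large",  # optional; enables SAS metrics
    add_isolated_node_eval=True,  # additionally evaluate the reader on labels as input
)

# Metrics are calculated afterwards; document_scope/answer_scope control what
# counts as a correct match (see EvaluationResult.calculate_metrics()).
metrics = eval_result.calculate_metrics(document_scope="document_id")
print(metrics["Retriever"]["recall_single_hit"])  # metric keys here are illustrative
print(metrics["Reader"]["f1"])
```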
Review comment on `add_isolated_node_eval`:

Do we need this parameter `add_isolated_node_eval`? As a user of this API it wasn't immediately clear to me what it is about and why we need it.

Reply:

Yes, we need it. It's the same parameter as in the standard `run()`. If it is set to `True`, the evaluation is executed with labels as node inputs in addition to the integrated evaluation, where the node inputs are the outputs of the previous node in the pipeline.
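To make that concrete, here is a hedged sketch of how the isolated evaluation is typically consumed, reusing the `pipeline` and `eval_labels` assumed above. The `eval_mode` argument of `calculate_metrics()` and the metric keys are assumptions based on the Haystack 1.x evaluation API, not something shown in this diff.

```python
# With add_isolated_node_eval=True, the reader is evaluated both on the previous
# node's output ("integrated") and on the gold documents from the labels ("isolated").
eval_result = pipeline.eval_batch(labels=eval_labels, add_isolated_node_eval=True)

integrated = eval_result.calculate_metrics(eval_mode="integrated")  # reader sees retriever output
isolated = eval_result.calculate_metrics(eval_mode="isolated")      # reader sees gold documents

# A large gap between the two suggests the retriever, not the reader, is the bottleneck.
print(integrated["Reader"]["f1"], isolated["Reader"]["f1"])
```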