Skip to content

Commit 34389fd

Browse files
feat: update quickstart loaded datasets (#4038)
# Description This PR removes loading the error analysis datasets from the `load_data.py` script used by the Docker quickstart image and adds loading the [Text Descriptives Metadata](https://huggingface.co/datasets/argilla/text-descriptives-metadata) dataset from Hugging Face. Closes #<issue_number> **Type of change** - [x] New feature (non-breaking change which adds functionality) **How Has This Been Tested** (Please describe the tests that you ran to verify your changes. And ideally, reference `tests`) - [ ] Test A - [ ] Test B **Checklist** - [ ] I added relevant documentation - [x] I followed the style guidelines of this project - [x] I did a self-review of my code - [ ] I made corresponding changes to the documentation - [x] My changes generate no new warnings - [x] I have added tests that prove my fix is effective or that my feature works - [ ] I filled out [the contributor form](https://tally.so/r/n9XrxK) (see text above) - [ ] I have added relevant notes to the `CHANGELOG.md` file (See https://keepachangelog.com/) --------- Co-authored-by: Francisco Aranda <[email protected]>
1 parent 4e7afdf commit 34389fd

File tree

2 files changed

+17
-112
lines changed

2 files changed

+17
-112
lines changed

docker/scripts/load_data.py

+3-101
Original file line number · Diff line number · Diff line change
@@ -162,104 +162,6 @@ def load_feedback_dataset_from_huggingface(repo_id: str, split: str = "train", s
162162

163163
dataset.push_to_argilla(name=repo_id.split("/")[-1])
164164

165-
@staticmethod
166-
def build_error_analysis_record(
167-
row: pd.Series, legacy: bool = False
168-
) -> Union[rg.FeedbackRecord, rg.TextClassificationRecord]:
169-
fields = {
170-
"user-message-1": row["HumanMessage1"],
171-
"llm-output": row["llm_output"]
172-
if not row["llm_output"].__contains__("```json")
173-
else row["llm_output"].replace("'", '"'),
174-
"ai-message": (f"```json\n{row['AIMessage']}\n```" if not legacy else row["AIMessage"]).replace("'", '"'),
175-
"function-message": (f"```json\n{row['FunctionMessage']}\n```" if not legacy else row["AIMessage"]).replace(
176-
"'", '"'
177-
),
178-
"system-message": "You are an AI assistant name ACME",
179-
"langsmith-url": f"https://smith.langchain.com/o/{row['parent_id']}",
180-
}
181-
metadata = {
182-
"correctness-langsmith": row["correctness_langsmith"],
183-
"model-name": row["model_name"],
184-
"temperature": row["temperature"],
185-
"max-tokens": int(row["max_tokens"]),
186-
"cpu-user": row["cpu_time_user"],
187-
"cpu-system": row["cpu_time_system"],
188-
"library-version": row["library_version"],
189-
}
190-
191-
if legacy:
192-
return rg.TextClassificationRecord(
193-
inputs=fields, metadata=metadata, vectors=eval(row["vectors"]), multi_label=True
194-
)
195-
return rg.FeedbackRecord(fields=fields, metadata=metadata)
196-
197-
@staticmethod
198-
def load_error_analysis(with_metadata_property_options: bool = True):
199-
print("Loading Error Analysis dataset as a `FeedbackDataset` (Alpha)")
200-
df = pd.read_csv("https://raw.githubusercontent.com/argilla-io/dataset_examples/main/synthetic_data_v2.csv")
201-
202-
fields = [
203-
rg.TextField(name="user-message-1", use_markdown=True),
204-
rg.TextField(name="llm-output", use_markdown=True),
205-
rg.TextField(name="ai-message", use_markdown=True, required=False),
206-
rg.TextField(name="function-message", use_markdown=True, required=False),
207-
rg.TextField(name="system-message", use_markdown=True, required=False),
208-
rg.TextField(name="langsmith-url", use_markdown=True, required=False),
209-
]
210-
211-
questions = [
212-
rg.MultiLabelQuestion(
213-
name="issue",
214-
title="Please categorize the record:",
215-
labels=["follow-up needed", "reviewed", "no-repro", "not-helpful", "empty-response", "critical"],
216-
),
217-
rg.TextQuestion(name="note", title="Leave a note to describe the issue:", required=False),
218-
]
219-
220-
dataset_name = "error-analysis-with-feedback"
221-
222-
if with_metadata_property_options:
223-
metadata = [
224-
rg.TermsMetadataProperty(
225-
name="correctness-langsmith", values=df.correctness_langsmith.unique().tolist()
226-
),
227-
rg.TermsMetadataProperty(name="model-name", values=df.model_name.unique().tolist()),
228-
rg.FloatMetadataProperty(name="temperature", min=df.temperature.min(), max=df.temperature.max()),
229-
rg.FloatMetadataProperty(name="cpu-user", min=df.cpu_time_user.min(), max=df.cpu_time_user.max()),
230-
rg.FloatMetadataProperty(name="cpu-system", min=df.cpu_time_system.min(), max=df.cpu_time_system.max()),
231-
rg.TermsMetadataProperty(name="library-version", values=df.library_version.unique().tolist()),
232-
]
233-
else:
234-
dataset_name += "-no-settings"
235-
236-
metadata = [
237-
rg.TermsMetadataProperty(name="correctness-langsmith"),
238-
rg.TermsMetadataProperty(name="model-name"),
239-
rg.FloatMetadataProperty(name="temperature"),
240-
rg.FloatMetadataProperty(name="cpu-user"),
241-
rg.FloatMetadataProperty(name="cpu-system"),
242-
rg.TermsMetadataProperty(name="library-version"),
243-
]
244-
245-
dataset = rg.FeedbackDataset(fields=fields, questions=questions, metadata_properties=metadata)
246-
dataset.add_records(records=[LoadDatasets.build_error_analysis_record(row) for _, row in df.iterrows()])
247-
dataset.push_to_argilla(name=dataset_name)
248-
249-
@staticmethod
250-
def load_error_analysis_textcat_version():
251-
print("Loading Error Analysis dataset as a `DatasetForTextClassification`")
252-
df = pd.read_csv(
253-
"https://raw.githubusercontent.com/argilla-io/dataset_examples/main/synthetic_data_v2_with_vectors.csv"
254-
)
255-
256-
labels = ["follow-up needed", "reviewed", "no-repro", "not-helpful", "empty-response", "critical"]
257-
settings = rg.TextClassificationSettings(label_schema=labels)
258-
rg.configure_dataset_settings(name="error-analysis-with-text-classification", settings=settings)
259-
260-
records = [LoadDatasets.build_error_analysis_record(row, legacy=True) for _, row in df.iterrows()]
261-
rg.log(name="error-analysis-with-text-classification", records=records, batch_size=25)
262-
263165

264166
if __name__ == "__main__":
265167
API_KEY = sys.argv[1]
@@ -274,9 +176,6 @@ def load_error_analysis_textcat_version():
274176
response = requests.get("http://0.0.0.0:6900")
275177
if response.status_code == 200:
276178
ld = LoadDatasets(API_KEY)
277-
ld.load_error_analysis(with_metadata_property_options=False)
278-
ld.load_error_analysis()
279-
ld.load_error_analysis_textcat_version()
280179
ld.load_feedback_dataset_from_huggingface(
281180
repo_id="argilla/databricks-dolly-15k-curated-en", split="train", samples=100
282181
)
@@ -296,6 +195,9 @@ def load_error_analysis_textcat_version():
296195
ld.load_feedback_dataset_from_huggingface(
297196
repo_id="argilla/oasst_response_comparison", split="train", samples=100
298197
)
198+
ld.load_feedback_dataset_from_huggingface(
199+
repo_id="argilla/text-descriptives-metadata", split="train", samples=100
200+
)
299201
except requests.exceptions.ConnectionError:
300202
pass
301203
except Exception as e:

src/argilla/client/feedback/integrations/huggingface/dataset.py

+14-11
Original file line number · Diff line number · Diff line change
@@ -300,6 +300,13 @@ def from_huggingface(cls: Type["FeedbackDataset"], repo_id: str, *args: Any, **k
300300
)
301301
with open(config_path, "r") as f:
302302
config = DatasetConfig.from_yaml(f.read())
303+
dataset = cls(
304+
fields=config.fields,
305+
questions=config.questions,
306+
guidelines=config.guidelines,
307+
metadata_properties=config.metadata_properties,
308+
allow_extra_metadata=config.allow_extra_metadata,
309+
)
303310
except EntryNotFoundError:
304311
# TODO(alvarobartt): here for backwards compatibility, last used in 1.12.0
305312
warnings.warn(
@@ -318,6 +325,7 @@ def from_huggingface(cls: Type["FeedbackDataset"], repo_id: str, *args: Any, **k
318325
)
319326
with open(config_path, "r") as f:
320327
config = DeprecatedDatasetConfig.from_json(f.read())
328+
dataset = cls(fields=config.fields, questions=config.questions, guidelines=config.guidelines)
321329
except Exception as e:
322330
raise FileNotFoundError(
323331
"Neither `argilla.yaml` nor `argilla.cfg` files were found in the"
@@ -340,7 +348,7 @@ def from_huggingface(cls: Type["FeedbackDataset"], repo_id: str, *args: Any, **k
340348
responses = {}
341349
suggestions = []
342350
user_without_id = False
343-
for question in config.questions:
351+
for question in dataset.questions:
344352
if hfds[index][question.name] is not None and len(hfds[index][question.name]) > 0:
345353
if (
346354
len(
@@ -414,20 +422,15 @@ def from_huggingface(cls: Type["FeedbackDataset"], repo_id: str, *args: Any, **k
414422

415423
records.append(
416424
FeedbackRecord(
417-
fields={field.name: hfds[index][field.name] for field in config.fields},
425+
fields={field.name: hfds[index][field.name] for field in dataset.fields},
418426
metadata=metadata or {},
419427
responses=list(responses.values()) or [],
420428
suggestions=[suggestion for suggestion in suggestions if suggestion["value"] is not None] or [],
421429
external_id=hfds[index]["external_id"],
422430
)
423431
)
424432
del hfds
425-
instance = cls(
426-
fields=config.fields,
427-
questions=config.questions,
428-
guidelines=config.guidelines,
429-
metadata_properties=config.metadata_properties,
430-
allow_extra_metadata=config.allow_extra_metadata,
431-
)
432-
instance.add_records(records)
433-
return instance
433+
434+
dataset.add_records(records)
435+
436+
return dataset

0 commit comments

Comments (0)