From a0704227b36436525c0fb4a29550b329bde352c3 Mon Sep 17 00:00:00 2001
From: Patrick Cramer
Date: Fri, 1 Sep 2023 13:32:40 -0700
Subject: [PATCH 1/8] for classification tasks with a large number of
 categories, filter the list of labels by similarity to the prompt

---
 src/autolabel/tasks/classification.py | 39 +++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/autolabel/tasks/classification.py b/src/autolabel/tasks/classification.py
index fe6c5066..e9c64344 100644
--- a/src/autolabel/tasks/classification.py
+++ b/src/autolabel/tasks/classification.py
@@ -55,6 +55,45 @@ def construct_prompt(self, input: Dict, examples: List) -> str:
         # prepare task guideline
         labels_list = self.config.labels_list()
         num_labels = len(labels_list)
+
+        # if large number of labels, filter by similarity to input
+        if num_labels >= 50:
+            from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
+            from langchain.vectorstores import Chroma
+            from langchain.embeddings import OpenAIEmbeddings
+            from langchain.prompts import FewShotPromptTemplate, PromptTemplate
+            print(f"PC_DEBUG :: input = {input}")
+            example_prompt = PromptTemplate(
+                input_variables=["input"],
+                template="{input}",
+            )
+            examples = [{"input":label} for label in labels_list]
+
+            example_selector = SemanticSimilarityExampleSelector.from_examples(
+                # This is the list of examples available to select from.
+                examples,
+                # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
+                OpenAIEmbeddings(),
+                # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
+                Chroma,
+                # This is the number of examples to produce.
+                k=5
+            )
+            similar_prompt = FewShotPromptTemplate(
+                # We provide an ExampleSelector instead of examples.
+                example_selector=example_selector,
+                example_prompt=example_prompt,
+                prefix="Input: {example}\n",
+                suffix="",
+                input_variables=["example"],
+            )
+            print(similar_prompt.format(example=input["example"]))
+            #sampled_labels = similar_prompt.format(example=input["example"])
+            #print(sampled_labels)
+            #print(type(sampled_labels))
+            #print(len(sampled_labels))
+            exit()
+
         fmt_task_guidelines = self.task_guidelines.format(
             num_labels=num_labels, labels="\n".join(labels_list)
         )
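Note: the prototype above can be exercised on its own, roughly as below. This is a sketch, not part of the patch: it assumes the same langchain APIs the diff uses (SemanticSimilarityExampleSelector, Chroma, OpenAIEmbeddings), a chromadb installation, an OPENAI_API_KEY in the environment, and made-up label names.

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.vectorstores import Chroma

labels = ["billing issue", "shipping delay", "refund request", "product defect", "other"]

# Each label is wrapped as a one-field "example" so the selector can embed it.
selector = SemanticSimilarityExampleSelector.from_examples(
    [{"input": label} for label in labels],
    OpenAIEmbeddings(),  # embeddings for both the labels and the query
    Chroma,              # vector store used for the similarity search
    k=2,                 # keep only the 2 labels closest to the input
)

prompt = FewShotPromptTemplate(
    example_selector=selector,
    example_prompt=PromptTemplate(input_variables=["input"], template="{input}"),
    prefix="Input: {example}\n",
    suffix="",
    input_variables=["example"],
)

# Prints "Input: <example>" followed by one selected label per line; patch 2
# below parses this text back into a shortened labels_list.
print(prompt.format(example="My package still has not arrived after two weeks"))
```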
From e3c2126f75bd1296b66510c0dc3abd1ab9a5e737 Mon Sep 17 00:00:00 2001
From: Patrick Cramer
Date: Fri, 1 Sep 2023 15:00:33 -0700
Subject: [PATCH 2/8] replace Chroma DB with autolabel's own VectorStoreWrapper.
 Remove debug prints

---
 src/autolabel/tasks/classification.py | 38 +++++++++++++++--------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/autolabel/tasks/classification.py b/src/autolabel/tasks/classification.py
index e9c64344..ba1faf71 100644
--- a/src/autolabel/tasks/classification.py
+++ b/src/autolabel/tasks/classification.py
@@ -21,6 +21,11 @@
 
 import json
 
+from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.prompts import FewShotPromptTemplate, PromptTemplate
+from autolabel.few_shot.vector_store import VectorStoreWrapper
+
 class ClassificationTask(BaseTask):
     DEFAULT_OUTPUT_GUIDELINES = (
@@ -56,43 +61,38 @@ def construct_prompt(self, input: Dict, examples: List) -> str:
         labels_list = self.config.labels_list()
         num_labels = len(labels_list)
 
-        # if large number of labels, filter by similarity to input
+        # if large number of labels, filter labels_list by similarity of labels to input
         if num_labels >= 50:
-            from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
-            from langchain.vectorstores import Chroma
-            from langchain.embeddings import OpenAIEmbeddings
-            from langchain.prompts import FewShotPromptTemplate, PromptTemplate
-            print(f"PC_DEBUG :: input = {input}")
             example_prompt = PromptTemplate(
                 input_variables=["input"],
                 template="{input}",
             )
-            examples = [{"input":label} for label in labels_list]
+            label_examples = [{"input": label} for label in labels_list]
 
             example_selector = SemanticSimilarityExampleSelector.from_examples(
-                # This is the list of examples available to select from.
-                examples,
+                # This is the list of labels available to select from.
+                label_examples,
                 # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
                 OpenAIEmbeddings(),
                 # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
-                Chroma,
+                VectorStoreWrapper(cache=False),
                 # This is the number of examples to produce.
-                k=5
+                k=10,
            )
             similar_prompt = FewShotPromptTemplate(
-                # We provide an ExampleSelector instead of examples.
                 example_selector=example_selector,
                 example_prompt=example_prompt,
-                prefix="Input: {example}\n",
+                prefix="Input: {example}",
                 suffix="",
                 input_variables=["example"],
             )
-            print(similar_prompt.format(example=input["example"]))
-            #sampled_labels = similar_prompt.format(example=input["example"])
-            #print(sampled_labels)
-            #print(type(sampled_labels))
-            #print(len(sampled_labels))
-            exit()
+            sampled_labels = similar_prompt.format(example=input["example"])
+            split_lines = sampled_labels.split("\n")
+            labels_list = []
+            for i in range(1, len(split_lines)):
+                if split_lines[i]:
+                    labels_list.append(split_lines[i])
+            num_labels = len(labels_list)
 
         fmt_task_guidelines = self.task_guidelines.format(
             num_labels=num_labels, labels="\n".join(labels_list)
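Note: the label-recovery step in the hunk above is plain string parsing of the formatted few-shot prompt. The equivalent logic as a standalone helper (illustrative only, not part of the patch): line 0 is the "Input: ..." prefix and is skipped, and blank lines are dropped.

```python
from typing import List


def parse_sampled_labels(sampled_labels: str) -> List[str]:
    """Recover the selected labels from text shaped like
    'Input: <example>' followed by one label per line."""
    lines = sampled_labels.split("\n")
    return [line for line in lines[1:] if line]


assert parse_sampled_labels("Input: some text\nlabel a\n\nlabel b") == ["label a", "label b"]
```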
From 3cfca5491ec3c972ff247c5735a00588a22aff8b Mon Sep 17 00:00:00 2001
From: Patrick Cramer
Date: Wed, 6 Sep 2023 16:34:27 -0700
Subject: [PATCH 3/8] move label selection logic into its own class

---
 src/autolabel/configs/config.py          |  7 +++
 src/autolabel/configs/schema.py          |  1 +
 src/autolabel/few_shot/label_selector.py | 59 ++++++++++++++++++++++++
 src/autolabel/labeler.py                 | 29 ++++++++++--
 src/autolabel/tasks/classification.py    | 41 +++-------------
 5 files changed, 99 insertions(+), 38 deletions(-)
 create mode 100644 src/autolabel/few_shot/label_selector.py

diff --git a/src/autolabel/configs/config.py b/src/autolabel/configs/config.py
index 1a340cec..a0110f2b 100644
--- a/src/autolabel/configs/config.py
+++ b/src/autolabel/configs/config.py
@@ -47,6 +47,7 @@ class AutolabelConfig(BaseConfig):
     OUTPUT_GUIDELINE_KEY = "output_guidelines"
     OUTPUT_FORMAT_KEY = "output_format"
     CHAIN_OF_THOUGHT_KEY = "chain_of_thought"
+    LABEL_SELECTION_KEY = "label_selection"
     TRANSFORM_KEY = "transforms"
 
     # Dataset generation config keys (config["dataset_generation"][])
@@ -201,6 +202,12 @@ def chain_of_thought(self) -> bool:
         """Returns true if the model is able to perform chain of thought reasoning."""
         return self._prompt_config.get(self.CHAIN_OF_THOUGHT_KEY, False)
 
+    def label_selection(self) -> bool:
+        """Returns true if label selection is enabled. Label selection is the process of
+        narrowing down the list of legal labels by similarity to a given input. Useful for
+        classification tasks with a large number of possible classes."""
+        return self._prompt_config.get(self.LABEL_SELECTION_KEY, False)
+
     def transforms(self) -> List[Dict]:
         """Returns a list of transforms to apply to the data before sending to the model."""
         return self.config.get(self.TRANSFORM_KEY, [])

diff --git a/src/autolabel/configs/schema.py b/src/autolabel/configs/schema.py
index fe2a982e..96d3a958 100644
--- a/src/autolabel/configs/schema.py
+++ b/src/autolabel/configs/schema.py
@@ -120,6 +120,7 @@ def populate_few_shot_selection() -> List[str]:
         },
         "few_shot_num": {"type": ["number", "null"]},
         "chain_of_thought": {"type": ["boolean", "null"]},
+        "label_selection": {"type": ["boolean", "null"]},
     },
     "required": ["task_guidelines"],
     "additionalProperties": False,

diff --git a/src/autolabel/few_shot/label_selector.py b/src/autolabel/few_shot/label_selector.py
new file mode 100644
index 00000000..030e7f71
--- /dev/null
+++ b/src/autolabel/few_shot/label_selector.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from typing import Dict, List
+import bisect
+
+from autolabel.few_shot.vector_store import cos_sim
+
+from langchain.embeddings.openai import OpenAIEmbeddings
+
+
+class LabelSelector:
+    """Returns the most similar labels to a given input. Used for
+    classification tasks with a large number of possible classes."""
+
+    labels: List[str]
+    """A list of the possible labels to choose from."""
+
+    k: int = 10
+    """Number of labels to select"""
+
+    embedding_func = OpenAIEmbeddings()
+    """Function used to generate embeddings of labels/input"""
+
+    labels_embeddings: Dict = {}
+    """Dict used to store embeddings of each label"""
+
+    def __init__(
+        self, labels: List[str], k: int = 10, embedding_func=OpenAIEmbeddings()
+    ) -> None:
+        self.labels = labels
+        self.k = min(k, len(labels))
+        self.embedding_func = embedding_func
+        for l in self.labels:
+            self.labels_embeddings[l] = self.embedding_func.embed_query(l)
+
+    def select_labels(self, input: str) -> List[str]:
+        """Select which labels to use based on the similarity to input"""
+        input_embedding = self.embedding_func.embed_query(input)
+
+        scores = []
+        for label, embedding in self.labels_embeddings.items():
+            similarity = cos_sim(embedding, input_embedding)
+            # insert into scores, while maintaining sorted order
+            bisect.insort(scores, (similarity, label))
+        return [label for (_, label) in scores[-self.k :]]
+
+    @classmethod
+    def from_examples(
+        cls,
+        labels: List[str],
+        k: int = 10,
+        embedding_func=OpenAIEmbeddings(),
+    ) -> LabelSelector:
+        """Create pass-through label selector using given list of labels
+
+        Returns:
+            The LabelSelector instantiated
+        """
+        return cls(labels=labels, k=k, embedding_func=embedding_func)
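Note: a usage sketch for the new class as it stands in this patch (later patches in the series require passing embedding_func explicitly). It assumes an OPENAI_API_KEY for the default OpenAIEmbeddings; the labels and input are made up.

```python
from autolabel.few_shot.label_selector import LabelSelector

labels = ["sports", "politics", "technology", "finance", "health"]
selector = LabelSelector(labels=labels, k=3)

# Embeds the input once, scores it against every precomputed label embedding
# with cos_sim, and returns the k most similar labels. Because the sorted score
# list is sliced from the end, the best-matching label comes last.
print(selector.select_labels("The central bank raised interest rates again"))
```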
diff --git a/src/autolabel/labeler.py b/src/autolabel/labeler.py
index f58b36d1..1abc3628 100644
--- a/src/autolabel/labeler.py
+++ b/src/autolabel/labeler.py
@@ -17,6 +17,7 @@ from autolabel.dataset import AutolabelDataset
 from autolabel.data_models import AnnotationModel, TaskRunModel
 from autolabel.database import StateManager
 from autolabel.few_shot import ExampleSelectorFactory, BaseExampleSelector
+from autolabel.few_shot.label_selector import LabelSelector
 from autolabel.models import BaseModel, ModelFactory
 from autolabel.metrics import BaseMetric
 from autolabel.transforms import BaseTransform, TransformFactory
@@ -165,6 +166,11 @@ def run(
             cache=self.generation_cache is not None,
         )
 
+        if self.config.label_selection():
+            self.label_selector = LabelSelector.from_examples(
+                labels=self.config.labels_list()
+            )
+
         current_index = self.task_run.current_index if self.create_task else 0
         cost = 0.0
         postfix_dict = {}
@@ -185,8 +191,14 @@ def run(
                 )
             else:
                 examples = []
-            # Construct Prompt to pass to LLM
-            final_prompt = self.task.construct_prompt(chunk, examples)
+            # Construct Prompt to pass to LLM
+            if self.config.label_selection():
+                selected_labels = self.label_selector.select_labels(chunk["example"])
+                final_prompt = self.task.construct_prompt(
+                    chunk, examples, selected_labels
+                )
+            else:
+                final_prompt = self.task.construct_prompt(chunk, examples)
 
             response = self.llm.label([final_prompt])
             for i, generations, error in zip(
@@ -332,6 +344,11 @@ def plan(
             cache=self.generation_cache is not None,
         )
 
+        if self.config.label_selection():
+            self.label_selector = LabelSelector.from_examples(
+                labels=self.config.labels_list()
+            )
+
         input_limit = min(len(dataset.inputs), 100)
 
         for input_i in track(
@@ -346,7 +363,13 @@ def plan(
                 )
             else:
                 examples = []
-            final_prompt = self.task.construct_prompt(input_i, examples)
+            if self.config.label_selection():
+                selected_labels = self.label_selector.select_labels(input_i["example"])
+                final_prompt = self.task.construct_prompt(
+                    input_i, examples, selected_labels
+                )
+            else:
+                final_prompt = self.task.construct_prompt(input_i, examples)
             prompt_list.append(final_prompt)
 
             # Calculate the number of tokens

diff --git a/src/autolabel/tasks/classification.py b/src/autolabel/tasks/classification.py
index ba1faf71..5af82412 100644
--- a/src/autolabel/tasks/classification.py
+++ b/src/autolabel/tasks/classification.py
@@ -53,47 +53,18 @@ def __init__(self, config: AutolabelConfig) -> None:
         if self.config.confidence():
             self.metrics.append(AUROCMetric())
 
-    def construct_prompt(self, input: Dict, examples: List) -> str:
+    def construct_prompt(
+        self, input: Dict, examples: List, selected_labels: List[str] = None
+    ) -> str:
         # Copy over the input so that we can modify it
         input = input.copy()
 
         # prepare task guideline
-        labels_list = self.config.labels_list()
+        labels_list = (
+            self.config.labels_list() if not selected_labels else selected_labels
+        )
         num_labels = len(labels_list)
 
-        # if large number of labels, filter labels_list by similarity of labels to input
-        if num_labels >= 50:
-            example_prompt = PromptTemplate(
-                input_variables=["input"],
-                template="{input}",
-            )
-            label_examples = [{"input": label} for label in labels_list]
-
-            example_selector = SemanticSimilarityExampleSelector.from_examples(
-                # This is the list of labels available to select from.
-                label_examples,
-                # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
-                OpenAIEmbeddings(),
-                # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
-                VectorStoreWrapper(cache=False),
-                # This is the number of examples to produce.
-                k=10,
-            )
-            similar_prompt = FewShotPromptTemplate(
-                example_selector=example_selector,
-                example_prompt=example_prompt,
-                prefix="Input: {example}",
-                suffix="",
-                input_variables=["example"],
-            )
-            sampled_labels = similar_prompt.format(example=input["example"])
-            split_lines = sampled_labels.split("\n")
-            labels_list = []
-            for i in range(1, len(split_lines)):
-                if split_lines[i]:
-                    labels_list.append(split_lines[i])
-            num_labels = len(labels_list)
-
         fmt_task_guidelines = self.task_guidelines.format(
             num_labels=num_labels, labels="\n".join(labels_list)
         )
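Note: with this patch the feature is driven entirely by config. An illustrative config fragment follows (a sketch: only "label_selection" is what this patch adds; the task name, model choice, guidelines, labels, and example template are placeholders).

```python
config = {
    "task_name": "ProductCategorization",
    "task_type": "classification",
    "model": {"provider": "openai", "name": "gpt-3.5-turbo"},
    "prompt": {
        "task_guidelines": "Classify the input into one of {num_labels} categories:\n{labels}",
        "labels": ["category_1", "category_2", "category_3"],  # typically 50+ labels in practice
        "example_template": "Input: {example}\nOutput: {label}",
        "label_selection": True,  # narrow the label list per input before prompting
    },
}
```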
From f97a82d57b03bc3da5c59604b23ed7a573d230f7 Mon Sep 17 00:00:00 2001
From: Patrick Cramer
Date: Wed, 6 Sep 2023 17:01:05 -0700
Subject: [PATCH 4/8] allow for LabelSelector.k to be specified in config

---
 src/autolabel/configs/config.py |  5 +++++
 src/autolabel/configs/schema.py |  1 +
 src/autolabel/labeler.py        | 19 ++++++++++++++----
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/autolabel/configs/config.py b/src/autolabel/configs/config.py
index a0110f2b..f60138ea 100644
--- a/src/autolabel/configs/config.py
+++ b/src/autolabel/configs/config.py
@@ -48,6 +48,7 @@ class AutolabelConfig(BaseConfig):
     OUTPUT_FORMAT_KEY = "output_format"
     CHAIN_OF_THOUGHT_KEY = "chain_of_thought"
     LABEL_SELECTION_KEY = "label_selection"
+    LABEL_SELECTION_COUNT_KEY = "label_selection_count"
     TRANSFORM_KEY = "transforms"
 
     # Dataset generation config keys (config["dataset_generation"][])
@@ -208,6 +209,10 @@ def label_selection(self) -> bool:
         classification tasks with a large number of possible classes."""
         return self._prompt_config.get(self.LABEL_SELECTION_KEY, False)
 
+    def label_selection_count(self) -> int:
+        """Returns the number of labels to select in LabelSelector"""
+        return self._prompt_config.get(self.LABEL_SELECTION_COUNT_KEY, 10)
+
     def transforms(self) -> List[Dict]:
         """Returns a list of transforms to apply to the data before sending to the model."""
         return self.config.get(self.TRANSFORM_KEY, [])
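Note: the new count key slots in next to label_selection in the prompt config, as sketched below. When it is omitted, label_selection_count() falls back to 10, and LabelSelector further clamps k to the number of available labels (k = min(k, len(labels))).

```python
prompt_config_fragment = {
    "label_selection": True,
    "label_selection_count": 25,  # pass the 25 most similar labels to the LLM
}
```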
diff --git a/src/autolabel/configs/schema.py b/src/autolabel/configs/schema.py
index 96d3a958..4e3a5af1 100644
--- a/src/autolabel/configs/schema.py
+++ b/src/autolabel/configs/schema.py
@@ -121,6 +121,7 @@ def populate_few_shot_selection() -> List[str]:
         "few_shot_num": {"type": ["number", "null"]},
         "chain_of_thought": {"type": ["boolean", "null"]},
         "label_selection": {"type": ["boolean", "null"]},
+        "label_selection_count": {"type": ["number", "null"]},
     },
     "required": ["task_guidelines"],
     "additionalProperties": False,

diff --git a/src/autolabel/labeler.py b/src/autolabel/labeler.py
index 1abc3628..944e41a7 100644
--- a/src/autolabel/labeler.py
+++ b/src/autolabel/labeler.py
@@ -16,7 +16,12 @@ from autolabel.dataset import AutolabelDataset
 from autolabel.data_models import AnnotationModel, TaskRunModel
 from autolabel.database import StateManager
-from autolabel.few_shot import ExampleSelectorFactory, BaseExampleSelector
+from autolabel.few_shot import (
+    ExampleSelectorFactory,
+    BaseExampleSelector,
+    DEFAULT_EMBEDDING_PROVIDER,
+    PROVIDER_TO_MODEL,
+)
 from autolabel.few_shot.label_selector import LabelSelector
 from autolabel.models import BaseModel, ModelFactory
 from autolabel.metrics import BaseMetric
@@ -168,7 +173,11 @@ def run(
         if self.config.label_selection():
             self.label_selector = LabelSelector.from_examples(
-                labels=self.config.labels_list()
+                labels=self.config.labels_list(),
+                k=self.config.label_selection_count(),
+                embedding_func=PROVIDER_TO_MODEL.get(
+                    self.config.embedding_provider(), DEFAULT_EMBEDDING_PROVIDER
+                )(),
             )
 
         current_index = self.task_run.current_index if self.create_task else 0
         cost = 0.0
         postfix_dict = {}
@@ -346,7 +355,11 @@ def plan(
         if self.config.label_selection():
             self.label_selector = LabelSelector.from_examples(
-                labels=self.config.labels_list()
+                labels=self.config.labels_list(),
+                k=self.config.label_selection_count(),
+                embedding_func=PROVIDER_TO_MODEL.get(
+                    self.config.embedding_provider(), DEFAULT_EMBEDDING_PROVIDER
+                )(),
             )
 
         input_limit = min(len(dataset.inputs), 100)
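Note: because k and the embedding function are now injected, the selector can be exercised without calling the embeddings API by passing a stub object that implements embed_query. A rough test sketch (assumptions: cos_sim accepts plain lists of floats, and any object with an embed_query method is acceptable as embedding_func; note that at this point in the series importing the module still constructs the default OpenAIEmbeddings(), which is what patch 6 below addresses).

```python
from autolabel.few_shot.label_selector import LabelSelector


class StubEmbeddings:
    """Deterministic 3-dimensional 'embeddings' for a handful of known strings."""

    _table = {
        "refund request": [1.0, 0.0, 0.0],
        "shipping delay": [0.0, 1.0, 0.0],
        "other": [0.0, 0.0, 1.0],
        "where is my parcel": [0.1, 0.9, 0.1],
    }

    def embed_query(self, text):
        return self._table[text]


selector = LabelSelector(
    labels=["refund request", "shipping delay", "other"],
    k=1,
    embedding_func=StubEmbeddings(),
)

# The query vector is closest to the "shipping delay" vector, so with k=1 only
# that label survives.
assert selector.select_labels("where is my parcel") == ["shipping delay"]
```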
Useful for classification tasks with a large number of possible classes.""" return self._prompt_config.get(self.LABEL_SELECTION_KEY, False) From 547e8116a69767ec42b585daa3c96feba7f7325b Mon Sep 17 00:00:00 2001 From: Patrick Cramer Date: Thu, 7 Sep 2023 14:53:48 -0700 Subject: [PATCH 6/8] remove default for embedding_func=OpenAIEmbeddings() , as this requires having OPENAI_API_KEY when importing autolabel --- src/autolabel/few_shot/label_selector.py | 10 ++++------ src/autolabel/labeler.py | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/autolabel/few_shot/label_selector.py b/src/autolabel/few_shot/label_selector.py index 030e7f71..7387654a 100644 --- a/src/autolabel/few_shot/label_selector.py +++ b/src/autolabel/few_shot/label_selector.py @@ -1,12 +1,10 @@ from __future__ import annotations - +from collections.abc import Callable from typing import Dict, List import bisect from autolabel.few_shot.vector_store import cos_sim -from langchain.embeddings.openai import OpenAIEmbeddings - class LabelSelector: """Returns the most similar labels to a given input. Used for @@ -18,14 +16,14 @@ class LabelSelector: k: int = 10 """Number of labels to select""" - embedding_func = OpenAIEmbeddings() + embedding_func: Callable = None """Function used to generate embeddings of labels/input""" labels_embeddings: Dict = {} """Dict used to store embeddings of each label""" def __init__( - self, labels: List[str], k: int = 10, embedding_func=OpenAIEmbeddings() + self, labels: List[str], embedding_func: Callable, k: int = 10 ) -> None: self.labels = labels self.k = min(k, len(labels)) @@ -48,8 +46,8 @@ def select_labels(self, input: str) -> List[str]: def from_examples( cls, labels: List[str], + embedding_func, k: int = 10, - embedding_func=OpenAIEmbeddings(), ) -> LabelSelector: """Create pass-through label selector using given list of labels diff --git a/src/autolabel/labeler.py b/src/autolabel/labeler.py index 944e41a7..f2446792 100644 --- a/src/autolabel/labeler.py +++ b/src/autolabel/labeler.py @@ -174,10 +174,10 @@ def run( if self.config.label_selection(): self.label_selector = LabelSelector.from_examples( labels=self.config.labels_list(), - k=self.config.label_selection_count(), embedding_func=PROVIDER_TO_MODEL.get( self.config.embedding_provider(), DEFAULT_EMBEDDING_PROVIDER )(), + k=self.config.label_selection_count(), ) current_index = self.task_run.current_index if self.create_task else 0 @@ -356,10 +356,10 @@ def plan( if self.config.label_selection(): self.label_selector = LabelSelector.from_examples( labels=self.config.labels_list(), - k=self.config.label_selection_count(), embedding_func=PROVIDER_TO_MODEL.get( self.config.embedding_provider(), DEFAULT_EMBEDDING_PROVIDER )(), + k=self.config.label_selection_count(), ) input_limit = min(len(dataset.inputs), 100) From 2a6ec296cb75347fc662bc0bf248489e00bc1e08 Mon Sep 17 00:00:00 2001 From: Patrick Cramer Date: Thu, 7 Sep 2023 15:39:26 -0700 Subject: [PATCH 7/8] if task_selection=true, check that task_type=classification --- src/autolabel/labeler.py | 49 +++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/autolabel/labeler.py b/src/autolabel/labeler.py index f2446792..05b2e78c 100644 --- a/src/autolabel/labeler.py +++ b/src/autolabel/labeler.py @@ -31,6 +31,7 @@ MetricResult, TaskRun, TaskStatus, + TaskType, ) from autolabel.tasks import TaskFactory from autolabel.utils import ( @@ -172,13 +173,18 @@ def run( ) if self.config.label_selection(): - self.label_selector = 
From 2a6ec296cb75347fc662bc0bf248489e00bc1e08 Mon Sep 17 00:00:00 2001
From: Patrick Cramer
Date: Thu, 7 Sep 2023 15:39:26 -0700
Subject: [PATCH 7/8] if label_selection=true, check that task_type=classification

---
 src/autolabel/labeler.py | 49 +++++++++++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/src/autolabel/labeler.py b/src/autolabel/labeler.py
index f2446792..05b2e78c 100644
--- a/src/autolabel/labeler.py
+++ b/src/autolabel/labeler.py
@@ -31,6 +31,7 @@
     MetricResult,
     TaskRun,
     TaskStatus,
+    TaskType,
 )
 from autolabel.tasks import TaskFactory
 from autolabel.utils import (
@@ -172,13 +173,18 @@ def run(
         )
 
         if self.config.label_selection():
-            self.label_selector = LabelSelector.from_examples(
-                labels=self.config.labels_list(),
-                embedding_func=PROVIDER_TO_MODEL.get(
-                    self.config.embedding_provider(), DEFAULT_EMBEDDING_PROVIDER
-                )(),
-                k=self.config.label_selection_count(),
-            )
+            if self.config.task_type() != TaskType.CLASSIFICATION:
+                self.console.print(
+                    "Warning: label_selection only supported for classification tasks!"
+                )
+            else:
+                self.label_selector = LabelSelector.from_examples(
+                    labels=self.config.labels_list(),
+                    embedding_func=PROVIDER_TO_MODEL.get(
+                        self.config.embedding_provider(), DEFAULT_EMBEDDING_PROVIDER
+                    )(),
+                    k=self.config.label_selection_count(),
+                )
 
         current_index = self.task_run.current_index if self.create_task else 0
         cost = 0.0
         postfix_dict = {}
@@ -201,7 +207,10 @@ def run(
             else:
                 examples = []
             # Construct Prompt to pass to LLM
-            if self.config.label_selection():
+            if (
+                self.config.label_selection()
+                and self.config.task_type() == TaskType.CLASSIFICATION
+            ):
                 selected_labels = self.label_selector.select_labels(chunk["example"])
                 final_prompt = self.task.construct_prompt(
                     chunk, examples, selected_labels
                 )
@@ -354,13 +363,18 @@ def plan(
         )
 
         if self.config.label_selection():
-            self.label_selector = LabelSelector.from_examples(
-                labels=self.config.labels_list(),
-                embedding_func=PROVIDER_TO_MODEL.get(
-                    self.config.embedding_provider(), DEFAULT_EMBEDDING_PROVIDER
-                )(),
-                k=self.config.label_selection_count(),
-            )
+            if self.config.task_type() != TaskType.CLASSIFICATION:
+                self.console.print(
+                    "Warning: label_selection only supported for classification tasks!"
+                )
+            else:
+                self.label_selector = LabelSelector.from_examples(
+                    labels=self.config.labels_list(),
+                    embedding_func=PROVIDER_TO_MODEL.get(
+                        self.config.embedding_provider(), DEFAULT_EMBEDDING_PROVIDER
+                    )(),
+                    k=self.config.label_selection_count(),
+                )
 
         input_limit = min(len(dataset.inputs), 100)
 
         for input_i in track(
@@ -376,7 +390,10 @@ def plan(
                 )
             else:
                 examples = []
-            if self.config.label_selection():
+            if (
+                self.config.label_selection()
+                and self.config.task_type() == TaskType.CLASSIFICATION
+            ):
                 selected_labels = self.label_selector.select_labels(input_i["example"])
                 final_prompt = self.task.construct_prompt(
                     input_i, examples, selected_labels

From e5ff12a1a3a674de746ad9f49b55511486c15f9d Mon Sep 17 00:00:00 2001
From: Patrick Cramer
Date: Thu, 7 Sep 2023 15:41:48 -0700
Subject: [PATCH 8/8] remove unused imports

---
 src/autolabel/tasks/classification.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/autolabel/tasks/classification.py b/src/autolabel/tasks/classification.py
index 5af82412..3d154bff 100644
--- a/src/autolabel/tasks/classification.py
+++ b/src/autolabel/tasks/classification.py
@@ -21,11 +21,6 @@
 
 import json
 
-from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
-from langchain.embeddings import OpenAIEmbeddings
-from langchain.prompts import FewShotPromptTemplate, PromptTemplate
-from autolabel.few_shot.vector_store import VectorStoreWrapper
-
 class ClassificationTask(BaseTask):
     DEFAULT_OUTPUT_GUIDELINES = (
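Note: end to end, the feature is exercised through the normal autolabel entry points. A final usage sketch (assumptions: LabelingAgent and AutolabelDataset as the public API, a CSV with an "example" column, and an OpenAI key for both the LLM and the embeddings; every key other than label_selection and label_selection_count is a placeholder):

```python
from autolabel import LabelingAgent, AutolabelDataset

config = {
    "task_name": "TicketCategorization",
    "task_type": "classification",  # label_selection is only applied to classification tasks
    "model": {"provider": "openai", "name": "gpt-3.5-turbo"},
    "prompt": {
        "task_guidelines": "Classify the ticket into one of {num_labels} categories:\n{labels}",
        "labels": ["category_1", "category_2", "category_3"],  # in practice, 50+ labels
        "example_template": "Ticket: {example}\nCategory: {label}",
        "label_selection": True,
        "label_selection_count": 10,
    },
}

agent = LabelingAgent(config)
dataset = AutolabelDataset("tickets.csv", config=config)

agent.plan(dataset)  # dry run: builds prompts (with narrowed label lists) and estimates cost
agent.run(dataset)   # labels the dataset, selecting the 10 closest labels per example
```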