wip: query analysis

langchain-ai · Mar 4, 2024 · d783816 · d783816
1 parent 22a9eee
commit d783816
Show file tree

Hide file tree

Showing 9 changed files with 513 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -173,13 +173,13 @@ poetry install --with lint,dev,test
 Run the following script to create a database and schema:
 
 ```sh
-python -m scripts.run_migrations create 
+poetry run python -m scripts.run_migrations create 
 ```
 
 From `/backend`:
 
 ```sh
-OPENAI_API_KEY=[YOUR API KEY] python -m server.main
+OPENAI_API_KEY=[YOUR API KEY] poetry run python -m server.main
 ```
 
 ### Testing 
@@ -189,7 +189,7 @@ separate from the main database. It will have the same schema as the main
 database.
 
 ```sh
-python -m scripts.run_migrations create-test-db
+poetry run python -m scripts.run_migrations create-test-db
 ```
 
 Run the tests

diff --git a/backend/db/models.py b/backend/db/models.py
@@ -6,7 +6,6 @@
 from sqlalchemy.dialects.postgresql import JSONB, UUID
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, relationship, sessionmaker
-from sqlalchemy.sql import func
 
 from server.settings import get_postgres_url
 
@@ -123,3 +122,70 @@ class Example(TimestampedModel):
 
     def __repr__(self) -> str:
         return f"<Example(uuid={self.uuid}, content={self.content[:20]}>"
+
+
+class QueryAnalyzer(TimestampedModel):
+    """A stored query analyzer: a JSON Schema, a prompt, and its examples."""
+
+    __tablename__ = "query_analyzers"
+
+    name = Column(
+        String(100),
+        nullable=False,
+        server_default="",
+        comment="The name of the query analyser.",
+    )
+    schema = Column(
+        JSONB, nullable=False, comment="JSON Schema that describes schema of query"
+    )
+    description = Column(
+        String(100),
+        nullable=False,
+        server_default="",
+        comment="Surfaced via UI to the users.",
+    )
+    instruction = Column(
+        Text, nullable=False, comment="The prompt to the language model."
+    )  # TODO: This will need to evolve
+
+    examples = relationship("QueryAnalysisExample", backref="query_analyzer")
+
+    def __repr__(self) -> str:
+        return f"<QueryAnalyzer(uuid={self.uuid}, description={self.description})>"
+
+
+class QueryAnalysisExample(TimestampedModel):
+    """An example input/output pair attached to a query analyzer.
+
+    Examples consist of content together with the expected output.
+
+    The output is the JSON object expected to be produced for the content. It
+    should be valid according to the schema of the associated query analyzer.
+
+    That schema can describe a list of objects, so it's perfectly fine for a
+    single example to hold several objects for one piece of content.
+    """
+
+    __tablename__ = "query_analysis_examples"
+
+    content = Column(
+        JSONB,
+        nullable=False,
+        comment="The input portion of the example.",
+    )
+    output = Column(
+        JSONB,
+        comment="The output associated with the example.",
+    )
+    query_analyzer_id = Column(
+        UUID(as_uuid=True),
+        ForeignKey("query_analyzers.uuid", ondelete="CASCADE"),
+        nullable=False,
+        comment="Foreign key referencing the associated query analyzer.",
+    )
+
+    def __repr__(self) -> str:
+        """Return a short debug representation with a truncated content preview."""
+        # JSONB content may be a dict, which cannot be sliced; slice its text form.
+        preview = str(self.content)[:20]
+        return f"<QueryAnalysisExample(uuid={self.uuid}, content={preview})>"
diff --git a/backend/extraction/utils.py b/backend/extraction/utils.py
@@ -43,10 +43,8 @@ def convert_json_schema_to_openai_schema(
     else:
         raise NotImplementedError("Only multi is supported for now.")
 
-    schema_.pop("definitions", None)
-
     return {
-        "name": "extractor",
-        "description": "Extract information matching the given schema.",
+        "name": "query_analyzer",
+        "description": "Generate optimized queries matching the given schema.",
         "parameters": _rm_titles(schema_) if rm_titles else schema_,
     }
diff --git a/backend/poetry.lock b/backend/poetry.lock
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
@@ -14,7 +14,7 @@ fastapi = "^0.109.2"
 langserve = "^0.0.45"
 uvicorn = "^0.27.1"
 pydantic = "^1.10"
-langchain-openai = "^0.0.6"
+langchain-openai = "^0.0.8"
 jsonschema = "^4.21.1"
 sse-starlette = "^2.0.0"
 alembic = "^1.13.1"

diff --git a/backend/server/main.py b/backend/server/main.py
@@ -10,6 +10,11 @@
     ExtractResponse,
     extraction_runnable,
 )
+from server.query_analysis import (
+    QueryAnalysisRequest,
+    QueryAnalysisResponse,
+    query_analyzer,
+)
 
 app = FastAPI(
     title="Extraction Powered by LangChain",
@@ -56,6 +61,14 @@ def ready():
     enabled_endpoints=["invoke", "batch"],
 )
 
+add_routes(
+    app,
+    query_analyzer.with_types(
+        input_type=QueryAnalysisRequest, output_type=QueryAnalysisResponse
+    ),
+    path="/query_analysis",  # route prefix for the query analysis runnable
+    enabled_endpoints=["invoke", "batch"],  # same endpoint set as the route above
+)
 
 if __name__ == "__main__":
     import uvicorn