
Commit 5221f22

feat(integrations): update spider api call to support async + removed duplicate code + readme update

1 parent 1e96a7d commit 5221f22

File tree

9 files changed: +213 −52 lines changed


Diff for: README.md (+204 −5)
````diff
@@ -76,7 +76,6 @@ Your contributions, big or small, are valuable to us. Let's build something amaz
 - [Integrations](#integrations)
 - [Other Features](#other-features)
 - [Adding Tools to Agents](#adding-tools-to-agents)
-- [Managing Sessions and Users](#managing-sessions-and-users)
 - [Document Integration and Search](#document-integration-and-search)
 - [Reference](#reference)
 - [SDK Reference](#sdk-reference)
@@ -86,6 +85,15 @@ Your contributions, big or small, are valuable to us. Let's build something amaz
 - [Different Use Cases](#different-use-cases)
 - [Different Form Factor](#different-form-factor)
 - [In Summary](#in-summary)
+- [Document Integration and Search](#document-integration-and-search-1)
+- [Reference](#reference-1)
+- [SDK Reference](#sdk-reference-1)
+- [API Reference](#api-reference-1)
+- [Local Quickstart](#local-quickstart-1)
+- [What's the difference between Julep and LangChain etc?](#whats-the-difference-between-julep-and-langchain-etc-1)
+- [Different Use Cases](#different-use-cases-1)
+- [Different Form Factor](#different-form-factor-1)
+- [In Summary](#in-summary-1)
 
 <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 
@@ -1400,6 +1408,9 @@ output:
 <td>
 
 ```yaml
+setup:
+  # No specific setup parameters are required for Wikipedia
+
 arguments:
   query: string # The search query string
   load_max_docs: integer # (Optional) Maximum number of documents to load. Default is 2.
@@ -1409,7 +1420,6 @@ output:
 ```
 
 </td>
-
 <td>
 
 **Example cookbook**: [cookbooks/03-trip-planning-assistant.ipynb](https://github.com/julep-ai/julep/blob/dev/cookbooks/03-trip-planning-assistant.ipynb)
@@ -1433,7 +1443,6 @@ output:
 ```
 
 </td>
-
 </tr>
 
 <tr>
@@ -1456,9 +1465,14 @@ output:
 ```
 
 </td>
+<td>
+
+**Example cookbook**: [cookbooks/07-personalized-research-assistant.ipynb](https://github.com/julep-ai/julep/blob/dev/cookbooks/07-personalized-research-assistant.ipynb)
 
+</td>
 </tr>
 
+
 <tr>
 <td> <b>Cloudinary</b> </td>
 <td>
@@ -1489,16 +1503,45 @@ output:
 ```
 
 </td>
-
 <td>
 
 **Example cookbook**: [cookbooks/05-video-processing-with-natural-language.ipynb](https://github.com/julep-ai/julep/blob/dev/cookbooks/05-video-processing-with-natural-language.ipynb)
 
 </td>
 </tr>
 
-</table>
+<tr>
+<td> <b>Arxiv</b> </td>
+<td>
 
+```yaml
+method: search # The method to use for the Arxiv integration
+
+setup:
+  # No specific setup parameters are required for Arxiv
+
+arguments:
+  query: string # The search query for searching with Arxiv
+  id_list: list[string] | None # (Optional) The list of Arxiv IDs to search with
+  max_results: integer # The maximum number of results to return, must be between 1 and 300000
+  download_pdf: boolean # Whether to download the PDF of the results. Default is false.
+  sort_by: string # The sort criterion for the results, options: relevance, lastUpdatedDate, submittedDate
+  sort_order: string # The sort order for the results, options: ascending, descending
+
+output:
+  result: list # A list of search results, each containing: entry_id, title, updated, published, authors, summary, comment, journal_ref, doi, primary_category, categories, links, pdf_url, pdf_downloaded
+```
+
+</td>
+
+<td>
+
+**Example cookbook**: [cookbooks/07-personalized-research-assistant.ipynb](https://github.com/julep-ai/julep/blob/dev/cookbooks/07-personalized-research-assistant.ipynb)
+
+</td>
+</tr>
+
+</table>
 For more details, refer to our [Integrations Documentation](#integrations).
 
 <div align="center">
@@ -1508,6 +1551,10 @@ For more details, refer to our [Integrations Documentation](#integrations).
 <a href="#-table-of-contents">
 <img src="https://img.shields.io/badge/Table%20of%20Contents-000000?style=for-the-badge&logo=github&logoColor=white" alt="Table of Contents">
 </a>
+</a>&nbsp;|&nbsp;
+<a href="#-table-of-contents">
+<img src="https://img.shields.io/badge/Table%20of%20Contents-000000?style=for-the-badge&logo=github&logoColor=white" alt="Table of Contents">
+</a>
 </div>
 
 ## Other Features
@@ -1529,6 +1576,158 @@ client.agents.tools.create(
         "setup": {"api_key": "your_brave_api_key"},
     },
 )
+
+Julep offers a range of advanced features to enhance your AI workflows:
+### Managing Sessions and Users
+### Adding Tools to Agents
+Julep provides robust session management for persistent interactions:
+Extend your agent's capabilities by integrating external tools and APIs:
+```python
+session = client.sessions.create(
+    agent_id=agent.id,
+    user_id=user.id,
+    context_overflow="adaptive"
+)
+```python
+# Continue conversation in the same session
+response = client.sessions.chat(
+    session_id=session.id,
+    messages=[
+        {
+            "role": "user",
+            "content": "Follow up on the previous conversation."
+        }
+    ]
+)
+```
+
+### Document Integration and Search
+
+Easily manage and search through documents for your agents:
+
+```python
+# Upload a document
+document = client.agents.docs.create(
+    title="AI advancements",
+    content="AI is changing the world...",
+    metadata={"category": "research_paper"}
+)
+
+# Search documents
+results = client.agents.docs.search(
+    text="AI advancements",
+    metadata_filter={"category": "research_paper"}
+)
+```
+    agent_id=agent.id,
+    name="web_search",
+    description="Search the web for information.",
+    integration={
+</a>&nbsp;|&nbsp;
+<a href="#-table-of-contents">
+<img src="https://img.shields.io/badge/Table%20of%20Contents-000000?style=for-the-badge&logo=github&logoColor=white" alt="Table of Contents">
+</a>
+</div>
+
+## Reference
+
+### SDK Reference
+
+- **Node.js** [SDK Reference](https://github.com/julep-ai/node-sdk/blob/main/api.md) | [NPM Package](https://www.npmjs.com/package/@julep/sdk)
+- **Python** [SDK Reference](https://github.com/julep-ai/python-sdk/blob/main/api.md) | [PyPI Package](https://pypi.org/project/julep/)
+
+### API Reference
+
+Explore our API documentation to learn more about agents, tasks, and executions:
+
+- [Agents API](https://dev.julep.ai/api/docs#tag/agents)
+- [Tasks API](https://dev.julep.ai/api/docs#tag/tasks)
+- [Executions API](https://dev.julep.ai/api/docs#tag/executions)
+
+<div align="center">
+<a href="#top">
+<img src="https://img.shields.io/badge/Back%20to%20Top-000000?style=for-the-badge&logo=github&logoColor=white" alt="Back to Top">
+</a>&nbsp;|&nbsp;
+<a href="#-table-of-contents">
+<img src="https://img.shields.io/badge/Table%20of%20Contents-000000?style=for-the-badge&logo=github&logoColor=white" alt="Table of Contents">
+</a>
+</div>
+
+## Local Quickstart
+
+**Requirements**:
+
+- latest docker compose installed
+
+**Steps**:
+
+1. `git clone https://github.com/julep-ai/julep.git`
+2. `cd julep`
+3. `docker volume create cozo_backup`
+4. `docker volume create cozo_data`
+5. `cp .env.example .env # <-- Edit this file`
+6. `docker compose --env-file .env --profile temporal-ui --profile single-tenant --profile self-hosted-db up --build`
+
+<div align="center">
+<a href="#top">
+<img src="https://img.shields.io/badge/Back%20to%20Top-000000?style=for-the-badge&logo=github&logoColor=white" alt="Back to Top">
+</a>&nbsp;|&nbsp;
+<a href="#-table-of-contents">
+<img src="https://img.shields.io/badge/Table%20of%20Contents-000000?style=for-the-badge&logo=github&logoColor=white" alt="Table of Contents">
+</a>
+</div>
+
+---
+
+## What's the difference between Julep and LangChain etc?
+
+### Different Use Cases
+
+Think of LangChain and Julep as tools with different focuses within the AI development stack.
+
+LangChain is great for creating sequences of prompts and managing interactions with LLMs. It has a large ecosystem with lots of pre-built integrations, which makes it convenient if you want to get something up and running quickly. LangChain fits well with simple use cases that involve a linear chain of prompts and API calls.
+
+Julep, on the other hand, is more about building persistent AI agents that can maintain context over long-term interactions. It shines when you need complex workflows that involve multi-step tasks, conditional logic, and integration with various tools or APIs directly within the agent's process. It's designed from the ground up to manage persistent sessions and complex workflows.
+
+Use Julep if you imagine building a complex AI assistant that needs to:
+
+- Keep track of user interactions over days or weeks.
+- Perform scheduled tasks, like sending daily summaries or monitoring data sources.
+- Make decisions based on prior interactions or stored data.
+- Interact with multiple external services as part of its workflow.
+
+Then Julep provides the infrastructure to support all that without you having to build it from scratch.
+
+### Different Form Factor
+
+Julep is a **platform** that includes a language for describing workflows, a server for running those workflows, and an SDK for interacting with the platform. In order to build something with Julep, you write a description of the workflow in `YAML`, and then run the workflow in the cloud.
+
+Julep is built for heavy-lifting, multi-step, and long-running workflows and there's no limit to how complex the workflow can be.
+
+LangChain is a **library** that includes a few tools and a framework for building linear chains of prompts and tools. In order to build something with LangChain, you typically write Python code that configures and runs the model chains you want to use.
+
+LangChain might be sufficient and quicker to implement for simple use cases that involve a linear chain of prompts and API calls.
+
+### In Summary
+
+Use LangChain when you need to manage LLM interactions and prompt sequences in a stateless or short-term context.
+
+Choose Julep when you need a robust framework for stateful agents with advanced workflow capabilities, persistent sessions, and complex task orchestration.
+
+<div align="center">
+<a href="#top">
+<img src="https://img.shields.io/badge/Back%20to%20Top-000000?style=for-the-badge&logo=github&logoColor=white" alt="Back to Top">
+</a>&nbsp;|&nbsp;
+<a href="#-table-of-contents">
+<img src="https://img.shields.io/badge/Table%20of%20Contents-000000?style=for-the-badge&logo=github&logoColor=white" alt="Table of Contents">
+</a>
+</div>
+
+        "provider": "brave",
+        "method": "search",
+        "setup": {"api_key": "your_brave_api_key"},
+    },
+)
 ```
 
 ### Managing Sessions and Users
````
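The README hunk above documents `client.agents.docs.search` with a `text` query and a `metadata_filter`. As a hedged, offline illustration of the filtering semantics that call implies (this is a hypothetical stand-in, not the Julep SDK itself — the hosted search is semantic, while this sketch just matches substrings and exact metadata pairs):

```python
def search_docs(docs, text, metadata_filter):
    # Keep documents whose title/content mention the query text and
    # whose metadata matches every key/value in the filter exactly.
    return [
        d for d in docs
        if text.lower() in (d["title"] + " " + d["content"]).lower()
        and all(d["metadata"].get(k) == v for k, v in metadata_filter.items())
    ]
```

This mirrors the README example: searching for "AI advancements" restricted to `{"category": "research_paper"}` returns only the uploaded research paper.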

Diff for: agents-api/agents_api/autogen/Tools.py (+2 −2)

```diff
@@ -1639,7 +1639,7 @@ class SpiderFetchArguments(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["scrape"] = "scrape"
+    mode: Literal["crawl", "scrape"] = "scrape"
     """
     The type of crawler to use
     """
@@ -1661,7 +1661,7 @@ class SpiderFetchArgumentsUpdate(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["scrape"] = "scrape"
+    mode: Literal["crawl", "scrape"] = "scrape"
    """
     The type of crawler to use
     """
```

Diff for: agents-api/agents_api/routers/sessions/chat.py (−12)

```diff
@@ -56,17 +56,6 @@ async def chat(
         ChatResponse: The chat response.
     """
 
-    # check if the developer is paid
-    if "paid" not in developer.tags:
-        # get the session length
-        sessions = count_sessions_query(developer_id=developer.id)
-        session_length = sessions["count"]
-        if session_length > max_free_sessions:
-            raise HTTPException(
-                status_code=status.HTTP_403_FORBIDDEN,
-                detail="Session length exceeded the free tier limit",
-            )
-
     # check if the developer is paid
     if "paid" not in developer.tags:
         # get the session length
@@ -108,7 +97,6 @@ async def chat(
         )
         for ref in doc_references
     ]
-
     # Render the system message
     if situation := chat_context.session.situation:
         system_message = dict(
```
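The deleted lines were an exact duplicate of the free-tier guard that remains in `chat`. Shown standalone, the surviving check's logic amounts to something like this sketch (plain function, hypothetical names, and an assumed `max_free_sessions` default — the real handler raises an `HTTPException` with status 403):

```python
def enforce_free_tier(tags, session_count, max_free_sessions=10):
    # Paid developers are never limited.
    if "paid" in tags:
        return
    # Free-tier developers are capped at max_free_sessions sessions.
    if session_count > max_free_sessions:
        raise PermissionError("Session length exceeded the free tier limit")
```

Running the check once instead of twice changes no behavior; the duplicate simply queried the session count a second time for the same result.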

Diff for: integrations-service/gunicorn_conf.py (−26)

```diff
@@ -24,32 +24,6 @@
 preload_app = False
 
 
-def when_ready(server):
-    """Run when server is ready to handle requests."""
-    # Ensure proper permissions for any required directories
-    for directory in ["logs", "run"]:
-        path = os.path.join(os.getcwd(), directory)
-        if not os.path.exists(path):
-            os.makedirs(path, mode=0o755)
-
-
-def on_starting(server):
-    """Run when server starts."""
-    server.log.setup(server.app.cfg)
-
-
-def worker_exit(server, worker):
-    """Clean up on worker exit."""
-    server.log.info(f"Worker {worker.pid} exiting gracefully")
-
-
-loglevel = "info"
-graceful_timeout = 30
-max_requests = 1000
-max_requests_jitter = 50
-preload_app = False
-
-
 def when_ready(server):
     """Run when server is ready to handle requests."""
     # Ensure proper permissions for any required directories
```

Diff for: integrations-service/integrations/autogen/Tools.py (+2 −2)

```diff
@@ -1639,7 +1639,7 @@ class SpiderFetchArguments(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["scrape"] = "scrape"
+    mode: Literal["crawl", "scrape"] = "scrape"
     """
     The type of crawler to use
     """
@@ -1661,7 +1661,7 @@ class SpiderFetchArgumentsUpdate(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["scrape"] = "scrape"
+    mode: Literal["crawl", "scrape"] = "scrape"
     """
     The type of crawler to use
     """
```

Diff for: integrations-service/integrations/utils/integrations/ffmpeg.py (+1 −1)

```diff
@@ -67,7 +67,7 @@ async def bash_cmd(arguments: FfmpegSearchArguments) -> FfmpegSearchOutput:
     # Decode base64 input
     try:
         input_data = base64.b64decode(arguments.file)
-    except Exception as e:
+    except Exception:
         return FfmpegSearchOutput(
             fileoutput="Error: Invalid base64 input", result=False, mime_type=None
         )
```
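The change drops the unused `as e` binding: the handler never inspects the exception, it only returns an error marker. A self-contained sketch of that pattern (hypothetical `decode_input` helper; the real code wraps the result in `FfmpegSearchOutput`):

```python
import base64

def decode_input(file_b64):
    # Try to decode; on any failure, report the same error string the
    # handler uses. No `as e` needed since the exception isn't used.
    try:
        return base64.b64decode(file_b64), None
    except Exception:
        return None, "Error: Invalid base64 input"
```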

Diff for: integrations-service/integrations/utils/integrations/spider.py (+1 −3)

```diff
@@ -1,5 +1,3 @@
-import asyncio
-
 from beartype import beartype
 from langchain_community.document_loaders import SpiderLoader
 from tenacity import retry, stop_after_attempt, wait_exponential
@@ -45,5 +43,5 @@ async def crawl(
         params=arguments.params,
     )
 
-    documents = await asyncio.to_thread(spider_loader.load)
+    documents = await spider_loader.aload()
     return SpiderFetchOutput(documents=documents)
```
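The change replaces a thread-hop around the blocking `load()` with the loader's native async `aload()`, which also lets the `asyncio` import go. The two call styles can be contrasted with a stub (the `StubLoader` below is a hypothetical stand-in for `SpiderLoader`, not the real class):

```python
import asyncio

class StubLoader:
    # Hypothetical stand-in for SpiderLoader with both call styles.
    def load(self):
        # Blocking, synchronous API.
        return ["doc1", "doc2"]

    async def aload(self):
        # Native async API; no worker thread required.
        return self.load()

async def fetch_before(loader):
    # Old code: push the blocking .load() onto a worker thread.
    return await asyncio.to_thread(loader.load)

async def fetch_after(loader):
    # New code: await the loader's native async .aload() directly.
    return await loader.aload()
```

Both return the same documents; the async path just avoids tying up a thread while the crawl runs.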
