Graphrag integration #4612 (Open)

Wants to merge 27 commits into base: main from lpinheiro/feat/add-graphrag-tools
Changes from 21 of 27 commits
e3e8f45  add initial global search draft (lpinheiroms, Dec 7, 2024)
8242378  add graphrag dep (lpinheiroms, Dec 9, 2024)
fb2fb19  Merge branch 'main' into lpinheiro/feat/add-graphrag-tools (lpinheiroms, Dec 10, 2024)
a13c18b  fix local search embedding (lpinheiroms, Dec 17, 2024)
8f3c484  linting (lpinheiroms, Dec 17, 2024)
0c05047  add from config constructor (lpinheiroms, Dec 17, 2024)
0e53f91  Merge branch 'main' into lpinheiro/feat/add-graphrag-tools (lspinheiro, Dec 17, 2024)
c1e7ea2  remove draft notebook (lpinheiroms, Dec 17, 2024)
a8b38ad  Merge branch 'main' into lpinheiro/feat/add-graphrag-tools (lspinheiro, Dec 19, 2024)
6d61c8e  update config factory and add docstrings (lpinheiroms, Dec 20, 2024)
1c4ed3d  add graphrag sample (lpinheiroms, Dec 20, 2024)
95f329c  add sample prompts (lpinheiroms, Dec 20, 2024)
3bc104b  update readme (lpinheiroms, Dec 20, 2024)
2ae6812  Merge branch 'main' into lpinheiro/feat/add-graphrag-tools (lspinheiro, Dec 20, 2024)
33523df  update deps (lpinheiroms, Dec 20, 2024)
8080ddb  Add API docs (ekzhu, Dec 30, 2024)
603c1c9  Update python/samples/agentchat_graphrag/requirements.txt (ekzhu, Dec 30, 2024)
934230b  Update python/samples/agentchat_graphrag/requirements.txt (ekzhu, Dec 30, 2024)
1c5fcd3  merge main, fix conflicts (lpinheiroms, Dec 30, 2024)
4f0c71f  update docstrings with snippet and doc ref (lpinheiroms, Dec 30, 2024)
e3dc1f9  lint (lpinheiroms, Dec 30, 2024)
f24fb6c  improve set up instructions in docstring (lpinheiroms, Jan 3, 2025)
4a5d611  lint (lpinheiroms, Jan 3, 2025)
74a2a23  Merge branch 'main' into lpinheiro/feat/add-graphrag-tools (lpinheiroms, Jan 3, 2025)
cac2aef  update lock (lpinheiroms, Jan 3, 2025)
e42f027  Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_gl… (lspinheiro, Jan 4, 2025)
e60a9aa  Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_lo… (lspinheiro, Jan 4, 2025)
1 change: 1 addition & 0 deletions python/packages/autogen-core/docs/src/reference/index.md
@@ -49,6 +49,7 @@ python/autogen_ext.teams.magentic_one
python/autogen_ext.models.openai
python/autogen_ext.models.replay
python/autogen_ext.tools.langchain
python/autogen_ext.tools.graphrag
python/autogen_ext.code_executors.local
python/autogen_ext.code_executors.docker
python/autogen_ext.code_executors.azure
@@ -0,0 +1,8 @@
autogen_ext.tools.graphrag
===========================


.. automodule:: autogen_ext.tools.graphrag
    :members:
    :undoc-members:
    :show-inheritance:
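This new reference page uses Sphinx's automodule directive to pull the docstrings from autogen_ext.tools.graphrag into the generated API docs; the entry added to index.md above links it into the reference tree.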
1 change: 1 addition & 0 deletions python/packages/autogen-ext/pyproject.toml
@@ -30,6 +30,7 @@ file-surfer = [
"autogen-agentchat==0.4.0.dev12",
"markitdown>=0.0.1a2",
]
graphrag = ["graphrag>=1.0.1"]
web-surfer = [
"autogen-agentchat==0.4.0.dev12",
"playwright>=1.48.0",
@@ -0,0 +1,23 @@
from ._config import (
    EmbeddingConfig,
    GlobalContextConfig,
    GlobalDataConfig,
    LocalContextConfig,
    LocalDataConfig,
    MapReduceConfig,
    SearchConfig,
)
from ._global_search import GlobalSearchTool
from ._local_search import LocalSearchTool

__all__ = [
    "GlobalSearchTool",
    "LocalSearchTool",
    "GlobalDataConfig",
    "LocalDataConfig",
    "GlobalContextConfig",
    "LocalContextConfig",
    "MapReduceConfig",
    "SearchConfig",
    "EmbeddingConfig",
]
@@ -0,0 +1,74 @@
from typing import Callable, Literal, Optional

from pydantic import BaseModel


class DataConfig(BaseModel):
    input_dir: str
    entity_table: str = "create_final_nodes"
    entity_embedding_table: str = "create_final_entities"
    community_level: int = 2


class GlobalDataConfig(DataConfig):
    community_table: str = "create_final_communities"
    community_report_table: str = "create_final_community_reports"


class LocalDataConfig(DataConfig):
    relationship_table: str = "create_final_relationships"
    text_unit_table: str = "create_final_text_units"


class ContextConfig(BaseModel):
    max_data_tokens: int = 8000


class GlobalContextConfig(ContextConfig):
    use_community_summary: bool = False
    shuffle_data: bool = True
    include_community_rank: bool = True
    min_community_rank: int = 0
    community_rank_name: str = "rank"
    include_community_weight: bool = True
    community_weight_name: str = "occurrence weight"
    normalize_community_weight: bool = True
    max_data_tokens: int = 12000


class LocalContextConfig(ContextConfig):
    text_unit_prop: float = 0.5
    community_prop: float = 0.25
    include_entity_rank: bool = True
    rank_description: str = "number of relationships"
    include_relationship_weight: bool = True
    relationship_ranking_attribute: str = "rank"


class MapReduceConfig(BaseModel):
    map_max_tokens: int = 1000
    map_temperature: float = 0.0
    reduce_max_tokens: int = 2000
    reduce_temperature: float = 0.0
    allow_general_knowledge: bool = False
    json_mode: bool = False
    response_type: str = "multiple paragraphs"


class SearchConfig(BaseModel):
    max_tokens: int = 1500
    temperature: float = 0.0
    response_type: str = "multiple paragraphs"


class EmbeddingConfig(BaseModel):
    api_key: Optional[str] = None
    model: str
    api_base: Optional[str] = None
    deployment_name: Optional[str] = None
    api_version: Optional[str] = None
    api_type: Literal["azure", "openai"] = "openai"
    organization: Optional[str] = None
    azure_ad_token_provider: Optional[Callable[[], str]] = None
    max_retries: int = 10
    request_timeout: float = 180.0
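As a quick illustration of how these pydantic models compose, here is a minimal sketch (the input_dir value is hypothetical; the imports match the package __init__ shown above):

from autogen_ext.tools.graphrag import GlobalContextConfig, GlobalDataConfig

# Only input_dir is required; the table names fall back to the
# create_final_* defaults declared above.
data_config = GlobalDataConfig(input_dir="./output/artifacts")  # hypothetical path

# Override a single context default; the rest keep their declared values.
context_config = GlobalContextConfig(max_data_tokens=16000)
assert context_config.use_community_summary is False  # inherited default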
@@ -0,0 +1,211 @@
# mypy: disable-error-code="no-any-unimported,misc"
from pathlib import Path

import pandas as pd
import tiktoken
from autogen_core import CancellationToken
from autogen_core.tools import BaseTool
from graphrag.config.config_file_loader import load_config_from_file
from graphrag.query.indexer_adapters import (
read_indexer_communities,
read_indexer_entities,
read_indexer_reports,
)
from graphrag.query.llm.base import BaseLLM
from graphrag.query.llm.get_client import get_llm
from graphrag.query.structured_search.global_search.community_context import GlobalCommunityContext
from graphrag.query.structured_search.global_search.search import GlobalSearch
from pydantic import BaseModel, Field

from ._config import GlobalContextConfig as ContextConfig
from ._config import GlobalDataConfig as DataConfig
from ._config import MapReduceConfig

_default_context_config = ContextConfig()
_default_mapreduce_config = MapReduceConfig()


class GlobalSearchToolArgs(BaseModel):
    query: str = Field(..., description="The user query to perform global search on.")


class GlobalSearchToolReturn(BaseModel):
    answer: str


class GlobalSearchTool(BaseTool[GlobalSearchToolArgs, GlobalSearchToolReturn]):
    """Enables running GraphRAG global search queries as an AutoGen tool.

    This tool allows you to perform semantic search over a corpus of documents using the GraphRAG framework.
    The search combines graph-based document relationships with semantic embeddings to find relevant information.

    Example usage with AssistantAgent:

    .. code-block:: python

        import asyncio

        from autogen_agentchat.agents import AssistantAgent
        from autogen_ext.models.openai import OpenAIChatCompletionClient
        from autogen_ext.tools.graphrag import GlobalSearchTool


        async def main():
            # Initialize the OpenAI client
            openai_client = OpenAIChatCompletionClient(
                model="gpt-4o-mini",
                api_key="<api-key>",
            )

            # Set up global search tool
            global_tool = GlobalSearchTool.from_settings(settings_path="./settings.yaml")

            # Create assistant agent with the global search tool
            assistant_agent = AssistantAgent(
                name="search_assistant",
                tools=[global_tool],
                model_client=openai_client,
                system_message=(
                    "You are a tool selector AI assistant using the GraphRAG framework. "
                    "Your primary task is to determine the appropriate search tool to call based on the user's query. "
                    "For broader, abstract questions requiring a comprehensive understanding of the dataset, call the 'global_search' function."
                ),
            )

            # Run a sample query
            query = "What is the overall sentiment of the community reports?"
            response_stream = assistant_agent.run_stream(task=query)
            async for msg in response_stream:
                if hasattr(msg, "content"):
                    print(f"\nAgent response: {msg.content}")


        if __name__ == "__main__":
            asyncio.run(main())

    .. note::

        This tool requires the :code:`graphrag` extra for the :code:`autogen-ext` package.
        It also requires indexed data created by the GraphRAG indexing process. See the
        `GraphRAG documentation <https://microsoft.github.io/graphrag/>`_ for details on
        how to prepare the required data files.

    Args:
        token_encoder (tiktoken.Encoding): The tokenizer used for text encoding
        llm (BaseLLM): The language model to use for search
        data_config (DataConfig): Configuration for data source locations and settings
        context_config (ContextConfig, optional): Configuration for context building. Defaults to default config.
        mapreduce_config (MapReduceConfig, optional): Configuration for map-reduce operations. Defaults to default config.
    """

    def __init__(
        self,
        token_encoder: tiktoken.Encoding,
        llm: BaseLLM,
        data_config: DataConfig,
        context_config: ContextConfig = _default_context_config,
        mapreduce_config: MapReduceConfig = _default_mapreduce_config,
    ):
        super().__init__(
            args_type=GlobalSearchToolArgs,
            return_type=GlobalSearchToolReturn,
            name="global_search_tool",
            description="Perform a global search with given parameters using graphrag.",
        )
        # Use the provided LLM
        self._llm = llm

        # Load parquet files
        community_df: pd.DataFrame = pd.read_parquet(f"{data_config.input_dir}/{data_config.community_table}.parquet")  # type: ignore
        entity_df: pd.DataFrame = pd.read_parquet(f"{data_config.input_dir}/{data_config.entity_table}.parquet")  # type: ignore
        report_df: pd.DataFrame = pd.read_parquet(  # type: ignore
            f"{data_config.input_dir}/{data_config.community_report_table}.parquet"
        )
        entity_embedding_df: pd.DataFrame = pd.read_parquet(  # type: ignore
            f"{data_config.input_dir}/{data_config.entity_embedding_table}.parquet"
        )

        communities = read_indexer_communities(community_df, entity_df, report_df)
        reports = read_indexer_reports(report_df, entity_df, data_config.community_level)
        entities = read_indexer_entities(entity_df, entity_embedding_df, data_config.community_level)

        context_builder = GlobalCommunityContext(
            community_reports=reports,
            communities=communities,
            entities=entities,
            token_encoder=token_encoder,
        )

        context_builder_params = {
            "use_community_summary": context_config.use_community_summary,
            "shuffle_data": context_config.shuffle_data,
            "include_community_rank": context_config.include_community_rank,
            "min_community_rank": context_config.min_community_rank,
            "community_rank_name": context_config.community_rank_name,
            "include_community_weight": context_config.include_community_weight,
            "community_weight_name": context_config.community_weight_name,
            "normalize_community_weight": context_config.normalize_community_weight,
            "max_tokens": context_config.max_data_tokens,
            "context_name": "Reports",
        }

        map_llm_params = {
            "max_tokens": mapreduce_config.map_max_tokens,
            "temperature": mapreduce_config.map_temperature,
            "response_format": {"type": "json_object"},
        }

        reduce_llm_params = {
            "max_tokens": mapreduce_config.reduce_max_tokens,
            "temperature": mapreduce_config.reduce_temperature,
        }

        self._search_engine = GlobalSearch(
            llm=self._llm,
            context_builder=context_builder,
            token_encoder=token_encoder,
            max_data_tokens=context_config.max_data_tokens,
            map_llm_params=map_llm_params,
            reduce_llm_params=reduce_llm_params,
            allow_general_knowledge=mapreduce_config.allow_general_knowledge,
            json_mode=mapreduce_config.json_mode,
            context_builder_params=context_builder_params,
            concurrent_coroutines=32,
            response_type=mapreduce_config.response_type,
        )

    async def run(self, args: GlobalSearchToolArgs, cancellation_token: CancellationToken) -> GlobalSearchToolReturn:
        result = await self._search_engine.asearch(args.query)
        assert isinstance(result.response, str), "Expected response to be a string"
        return GlobalSearchToolReturn(answer=result.response)

    @classmethod
    def from_settings(cls, settings_path: str | Path) -> "GlobalSearchTool":
        """Create a GlobalSearchTool instance from a GraphRAG settings file.

        Args:
            settings_path: Path to the GraphRAG settings.yaml file

        Returns:
            An initialized GlobalSearchTool instance
        """
        # Load GraphRAG config
        config = load_config_from_file(settings_path)

        # Initialize token encoder
        token_encoder = tiktoken.get_encoding(config.encoding_model)

        # Initialize LLM using graphrag's get_client
        llm = get_llm(config)

        # Create data config from storage paths
        data_config = DataConfig(
            input_dir=str(Path(config.storage.base_dir)),
        )

        return cls(
            token_encoder=token_encoder,
            llm=llm,
            data_config=data_config,
            context_config=_default_context_config,
            mapreduce_config=_default_mapreduce_config,
        )
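For completeness, a minimal sketch of invoking the tool directly rather than through an agent. This assumes ./settings.yaml and the GraphRAG index output already exist, and that run_json (inherited from autogen-core's BaseTool) accepts a plain dict of arguments:

import asyncio

from autogen_core import CancellationToken
from autogen_ext.tools.graphrag import GlobalSearchTool


async def main() -> None:
    tool = GlobalSearchTool.from_settings(settings_path="./settings.yaml")
    # run_json validates the dict against GlobalSearchToolArgs before running.
    result = await tool.run_json({"query": "What are the main themes in the dataset?"}, CancellationToken())
    print(result)


asyncio.run(main())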