
Commit c02ab09

Streamline workflows (#1674)
* Remove create_final_nodes
* Rename final entity output to "entities"
* Remove duplicate code from graph extraction
* Rename create_final_relationships output to "relationships"
* Rename create_final_communities output to "communities"
* Combine compute_communities and create_final_communities
* Rename create_final_covariates output to "covariates"
* Rename create_final_community_reports output to "community_reports"
* Rename create_final_text_units output to "text_units"
* Rename create_final_documents output to "documents"
* Remove transient snapshots config
* Move create_final_entities to finalize_entities operation
* Move create_final_relationships flow to finalize_relationships operation
* Reuse some community report functions
* Collapse most of graph and text unit-based report generation
* Unify schemas files
* Move community reports extractor
* Move NLP report prompt to prompts folder
* Fix a few pandas warnings
* Rename embeddings config to embed_text
* Rename claim_extraction config to extract_claims
* Remove nltk from standard graph extraction
* Fix verb tests
* Fix extract graph config naming
* Fix moved file reference
* Create v1-to-v2 migration notebook
* Semver
* Fix smoke test artifact count
* Raise tpm/rpm on smoke tests
* Update drift settings for smoke tests
* Reuse project directory var in api notebook
* Format
* Format
1 parent 83cc2da commit c02ab09


144 files changed (+1685, -2576 lines)
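Most of the commit is mechanical renaming: every create_final_* workflow output becomes a flat table name (create_final_nodes is removed outright, with no successor file). Below is a minimal sketch of that mapping applied to files on disk, for orientation only; the commit itself ships a v1-to-v2 migration notebook, and rename_outputs here is a hypothetical helper, not part of the change.

from pathlib import Path

# Old output file -> new output file, per the commit message above.
RENAMES = {
    "create_final_entities.parquet": "entities.parquet",
    "create_final_relationships.parquet": "relationships.parquet",
    "create_final_communities.parquet": "communities.parquet",
    "create_final_covariates.parquet": "covariates.parquet",
    "create_final_community_reports.parquet": "community_reports.parquet",
    "create_final_text_units.parquet": "text_units.parquet",
    "create_final_documents.parquet": "documents.parquet",
}

def rename_outputs(output_dir: str) -> None:
    # Hypothetical helper: rename any old-style outputs found in output_dir.
    for old, new in RENAMES.items():
        path = Path(output_dir) / old
        if path.exists():
            path.rename(path.with_name(new))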

.github/workflows/python-smoke-tests.yml

+2 -3

@@ -51,10 +51,9 @@ jobs:
       GRAPHRAG_CACHE_BASE_DIR: "cache"
       GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }}
       GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }}
-      GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL: ${{ secrets.GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL }}
       # We have Windows + Linux runners in 3.10 and 3.11, so we need to divide the rate limits by 4
-      GRAPHRAG_LLM_TPM: 45_000 # 180,000 / 4
-      GRAPHRAG_LLM_RPM: 270 # 1,080 / 4
+      GRAPHRAG_LLM_TPM: 100_000 # 400_000 / 4
+      GRAPHRAG_LLM_RPM: 500 # 2_000 / 4
       GRAPHRAG_EMBEDDING_TPM: 87_500 # 350,000 / 4
       GRAPHRAG_EMBEDDING_RPM: 525 # 2,100 / 4
       GRAPHRAG_CHUNK_SIZE: 1200
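The updated limits keep the divide-by-four rule from the comment (four runner combinations: Windows and Linux, each on Python 3.10 and 3.11). A quick check of the arithmetic in the diff's comments, assuming account-level quotas of 400,000 TPM and 2,000 RPM for the LLM:

RUNNERS = 4  # Windows + Linux, each on Python 3.10 and 3.11

assert 400_000 // RUNNERS == 100_000  # GRAPHRAG_LLM_TPM
assert 2_000 // RUNNERS == 500        # GRAPHRAG_LLM_RPM
assert 350_000 // RUNNERS == 87_500   # GRAPHRAG_EMBEDDING_TPM
assert 2_100 // RUNNERS == 525        # GRAPHRAG_EMBEDDING_RPM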
(new file)

@@ -0,0 +1,4 @@
+{
+    "type": "major",
+    "description": "Reorganize and rename workflows and their outputs."
+}

docs/examples_notebooks/api_overview.ipynb

+6 -13

@@ -49,7 +49,8 @@
 "source": [
 "import yaml\n",
 "\n",
-"settings = yaml.safe_load(open(\"<project_directory>/settings.yaml\")) # noqa: PTH123, SIM115"
+"PROJECT_DIRECTORY = \"<project_directory>\"\n",
+"settings = yaml.safe_load(open(f\"{PROJECT_DIRECTORY}/settings.yaml\")) # noqa: PTH123, SIM115"
 ]
 },
 {
@@ -74,9 +75,7 @@
 "source": [
 "from graphrag.config.create_graphrag_config import create_graphrag_config\n",
 "\n",
-"graphrag_config = create_graphrag_config(\n",
-"    values=settings, root_dir=\"<project_directory>\"\n",
-")"
+"graphrag_config = create_graphrag_config(values=settings, root_dir=PROJECT_DIRECTORY)"
 ]
 },
 {
@@ -126,20 +125,14 @@
 "source": [
 "import pandas as pd\n",
 "\n",
-"final_nodes = pd.read_parquet(\"<project_directory>/output/create_final_nodes.parquet\")\n",
-"final_entities = pd.read_parquet(\n",
-"    \"<project_directory>/output/create_final_entities.parquet\"\n",
-")\n",
-"final_communities = pd.read_parquet(\n",
-"    \"<project_directory>/output/create_final_communities.parquet\"\n",
-")\n",
+"final_entities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/entities.parquet\")\n",
+"final_communities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/communities.parquet\")\n",
 "final_community_reports = pd.read_parquet(\n",
-"    \"<project_directory>/output/create_final_community_reports.parquet\"\n",
+"    f\"{PROJECT_DIRECTORY}/output/community_reports.parquet\"\n",
 ")\n",
 "\n",
 "response, context = await api.global_search(\n",
 "    config=graphrag_config,\n",
-"    nodes=final_nodes,\n",
 "    entities=final_entities,\n",
 "    communities=final_communities,\n",
 "    community_reports=final_community_reports,\n",

docs/examples_notebooks/drift_search.ipynb

+9 -9

@@ -194,22 +194,22 @@
 "INPUT_DIR = \"./inputs/operation dulce\"\n",
 "LANCEDB_URI = f\"{INPUT_DIR}/lancedb\"\n",
 "\n",
-"COMMUNITY_REPORT_TABLE = \"create_final_community_reports\"\n",
-"ENTITY_TABLE = \"create_final_nodes\"\n",
-"ENTITY_EMBEDDING_TABLE = \"create_final_entities\"\n",
-"RELATIONSHIP_TABLE = \"create_final_relationships\"\n",
-"COVARIATE_TABLE = \"create_final_covariates\"\n",
-"TEXT_UNIT_TABLE = \"create_final_text_units\"\n",
+"COMMUNITY_REPORT_TABLE = \"community_reports\"\n",
+"COMMUNITY_TABLE = \"communities\"\n",
+"ENTITY_TABLE = \"entities\"\n",
+"RELATIONSHIP_TABLE = \"relationships\"\n",
+"COVARIATE_TABLE = \"covariates\"\n",
+"TEXT_UNIT_TABLE = \"text_units\"\n",
 "COMMUNITY_LEVEL = 2\n",
 "\n",
 "\n",
 "# read nodes table to get community and degree data\n",
 "entity_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_TABLE}.parquet\")\n",
-"entity_embedding_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet\")\n",
+"community_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet\")\n",
 "\n",
 "print(f\"Entity df columns: {entity_df.columns}\")\n",
 "\n",
-"entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)\n",
+"entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)\n",
 "\n",
 "# load description embeddings to an in-memory lancedb vectorstore\n",
 "# to connect to a remote db, specify url and port values.\n",
@@ -286,7 +286,7 @@
 "report_df = read_community_reports(INPUT_DIR)\n",
 "reports = read_indexer_reports(\n",
 "    report_df,\n",
-"    entity_df,\n",
+"    community_df,\n",
 "    COMMUNITY_LEVEL,\n",
 "    content_embedding_col=\"full_content_embeddings\",\n",
 ")\n",

docs/examples_notebooks/global_search.ipynb

+9 -11

@@ -75,9 +75,9 @@
 "source": [
 "### Load community reports as context for global search\n",
 "\n",
-"- Load all community reports in the `create_final_community_reports` table from the GraphRAG, to be used as context data for global search.\n",
-"- Load entities from the `create_final_nodes` and `create_final_entities` tables from the GraphRAG, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
-"- Load all communities in the `create_final_communites` table from the GraphRAG, to be used to reconstruct the community graph hierarchy for dynamic community selection."
+"- Load all community reports in the `community_reports` table from GraphRAG, to be used as context data for global search.\n",
+"- Load entities from the `entities` table from GraphRAG, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
+"- Load all communities in the `communities` table from GraphRAG, to be used to reconstruct the community graph hierarchy for dynamic community selection."
 ]
 },
 {
@@ -88,10 +88,9 @@
 "source": [
 "# parquet files generated from indexing pipeline\n",
 "INPUT_DIR = \"./inputs/operation dulce\"\n",
-"COMMUNITY_TABLE = \"create_final_communities\"\n",
-"COMMUNITY_REPORT_TABLE = \"create_final_community_reports\"\n",
-"ENTITY_TABLE = \"create_final_nodes\"\n",
-"ENTITY_EMBEDDING_TABLE = \"create_final_entities\"\n",
+"COMMUNITY_TABLE = \"communities\"\n",
+"COMMUNITY_REPORT_TABLE = \"community_reports\"\n",
+"ENTITY_TABLE = \"entities\"\n",
 "\n",
 "# community level in the Leiden community hierarchy from which we will load the community reports\n",
 "# higher value means we use reports from more fine-grained communities (at the cost of higher computation cost)\n",
@@ -298,11 +297,10 @@
 "community_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet\")\n",
 "entity_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_TABLE}.parquet\")\n",
 "report_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet\")\n",
-"entity_embedding_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet\")\n",
 "\n",
-"communities = read_indexer_communities(community_df, entity_df, report_df)\n",
-"reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)\n",
-"entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)\n",
+"communities = read_indexer_communities(community_df, report_df)\n",
+"reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)\n",
+"entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)\n",
 "\n",
 "print(f\"Total report count: {len(report_df)}\")\n",
 "print(\n",

docs/examples_notebooks/global_search_with_dynamic_community_selection.ipynb

+9 -11

@@ -75,9 +75,9 @@
 "source": [
 "### Load community reports as context for global search\n",
 "\n",
-"- Load all community reports in the `create_final_community_reports` table from the ire-indexing engine, to be used as context data for global search.\n",
-"- Load entities from the `create_final_nodes` and `create_final_entities` tables from the ire-indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
-"- Load all communities in the `create_final_communites` table from the ire-indexing engine, to be used to reconstruct the community graph hierarchy for dynamic community selection."
+"- Load all community reports in the `community_reports` table from the indexing engine, to be used as context data for global search.\n",
+"- Load entities from the `entities` table from the indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
+"- Load all communities in the `communities` table from the indexing engine, to be used to reconstruct the community graph hierarchy for dynamic community selection."
 ]
 },
 {
@@ -88,10 +88,9 @@
 "source": [
 "# parquet files generated from indexing pipeline\n",
 "INPUT_DIR = \"./inputs/operation dulce\"\n",
-"COMMUNITY_TABLE = \"create_final_communities\"\n",
-"COMMUNITY_REPORT_TABLE = \"create_final_community_reports\"\n",
-"ENTITY_TABLE = \"create_final_nodes\"\n",
-"ENTITY_EMBEDDING_TABLE = \"create_final_entities\"\n",
+"COMMUNITY_TABLE = \"communities\"\n",
+"COMMUNITY_REPORT_TABLE = \"community_reports\"\n",
+"ENTITY_TABLE = \"entities\"\n",
 "\n",
 "# we don't fix a specific community level but instead use an agent to dynamically\n",
 "# search through all the community reports to check if they are relevant.\n",
@@ -274,17 +273,16 @@
 "community_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet\")\n",
 "entity_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_TABLE}.parquet\")\n",
 "report_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet\")\n",
-"entity_embedding_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet\")\n",
 "\n",
-"communities = read_indexer_communities(community_df, entity_df, report_df)\n",
+"communities = read_indexer_communities(community_df, report_df)\n",
 "reports = read_indexer_reports(\n",
 "    report_df,\n",
-"    entity_df,\n",
+"    community_df,\n",
 "    community_level=COMMUNITY_LEVEL,\n",
 "    dynamic_community_selection=True,\n",
 ")\n",
 "entities = read_indexer_entities(\n",
-"    entity_df, entity_embedding_df, community_level=COMMUNITY_LEVEL\n",
+"    entity_df, community_df, community_level=COMMUNITY_LEVEL\n",
 ")\n",
 "\n",
 "print(f\"Total report count: {len(report_df)}\")\n",

docs/examples_notebooks/index_migration.ipynb renamed to docs/examples_notebooks/index_migration_to_v1.ipynb

+25 -18

@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 66,
+"execution_count": 62,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -14,9 +14,9 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Index Migration\n",
+"## Index Migration (pre-v1 to v1)\n",
 "\n",
-"This notebook is used to maintain data model parity with older indexes for the latest versions of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n",
+"This notebook is used to maintain data model parity with older indexes for version 1.0 of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n",
 "\n",
 "NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have default vector store config, which is now required or indexing will fail.\n",
 "\n",
@@ -25,40 +25,38 @@
 },
 {
 "cell_type": "code",
-"execution_count": 67,
+"execution_count": 63,
 "metadata": {},
 "outputs": [],
 "source": [
-"# This is the directory that has your settings.yml\n",
+"# This is the directory that has your settings.yaml\n",
 "# NOTE: much older indexes may have been output with a timestamped directory\n",
-"# if this is the case, you will need to make sure the storage.base_dir in settings.yml points to it correctly\n",
-"PROJECT_DIRECTORY = \"<your project directory>\""
+"# if this is the case, you will need to make sure the storage.base_dir in settings.yaml points to it correctly\n",
+"PROJECT_DIRECTORY = \"<your project directory>\""
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 64,
 "metadata": {},
 "outputs": [],
 "source": [
 "from pathlib import Path\n",
 "\n",
 "from graphrag.config.load_config import load_config\n",
-"from graphrag.config.resolve_path import resolve_paths\n",
 "from graphrag.storage.factory import StorageFactory\n",
 "\n",
-"# This first block does some config loading, path resolution, and translation that is normally done by the CLI/API when running a full workflow\n",
 "config = load_config(Path(PROJECT_DIRECTORY))\n",
-"resolve_paths(config)\n",
-"storage_config = config.storage.model_dump() # type: ignore\n",
+"storage_config = config.output.model_dump()\n",
 "storage = StorageFactory().create_storage(\n",
-"    storage_type=storage_config[\"type\"], kwargs=storage_config\n",
+"    storage_type=storage_config[\"type\"],\n",
+"    kwargs=storage_config,\n",
 ")"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 69,
+"execution_count": 65,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -69,7 +67,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 70,
+"execution_count": 66,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -98,7 +96,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 71,
+"execution_count": 67,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -134,7 +132,7 @@
 "if \"name\" in final_entities.columns:\n",
 "    final_entities.rename(columns={\"name\": \"title\"}, inplace=True)\n",
 "remove_columns(\n",
-"    final_entities, [\"mname_embedding\", \"graph_embedding\", \"description_embedding\"]\n",
+"    final_entities, [\"name_embedding\", \"graph_embedding\", \"description_embedding\"]\n",
 ")\n",
 "\n",
 "# Final nodes uses community for joins, which is now an int everywhere\n",
@@ -168,6 +166,15 @@
 "    final_communities[\"id\"] = [str(uuid4()) for _ in range(len(final_communities))]\n",
 "if \"parent\" not in final_communities.columns:\n",
 "    final_communities = final_communities.merge(parent_df, on=\"community\", how=\"left\")\n",
+"if \"entity_ids\" not in final_communities.columns:\n",
+"    node_mapping = (\n",
+"        final_nodes.loc[:, [\"community\", \"id\"]]\n",
+"        .groupby(\"community\")\n",
+"        .agg(entity_ids=(\"id\", list))\n",
+"    )\n",
+"    final_communities = final_communities.merge(\n",
+"        node_mapping, on=\"community\", how=\"left\"\n",
+"    )\n",
 "remove_columns(final_communities, [\"raw_community\"])\n",
 "\n",
 "# We need int for community and the human_readable_id copy for consistency\n",
@@ -197,7 +204,7 @@
 "source": [
 "from graphrag.cache.factory import CacheFactory\n",
 "from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks\n",
-"from graphrag.index.config.embeddings import get_embedded_fields, get_embedding_settings\n",
+"from graphrag.config.embeddings import get_embedded_fields, get_embedding_settings\n",
 "from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings\n",
 "\n",
 "# We only need to re-run the embeddings workflow, to ensure that embeddings for all required search fields are in place\n",
