
Commit c02ab09

Streamline workflows (#1674)
* Remove create_final_nodes
* Rename final entity output to "entities"
* Remove duplicate code from graph extraction
* Rename create_final_relationships output to "relationships"
* Rename create_final_communities output to "communities"
* Combine compute_communities and create_final_communities
* Rename create_final_covariates output to "covariates"
* Rename create_final_community_reports output to "community_reports"
* Rename create_final_text_units output to "text_units"
* Rename create_final_documents output to "documents"
* Remove transient snapshots config
* Move create_final_entities to finalize_entities operation
* Move create_final_relationships flow to finalize_relationships operation
* Reuse some community report functions
* Collapse most of graph and text unit-based report generation
* Unify schemas files
* Move community reports extractor
* Move NLP report prompt to prompts folder
* Fix a few pandas warnings
* Rename embeddings config to embed_text
* Rename claim_extraction config to extract_claims
* Remove nltk from standard graph extraction
* Fix verb tests
* Fix extract graph config naming
* Fix moved file reference
* Create v1-to-v2 migration notebook
* Semver
* Fix smoke test artifact count
* Raise tpm/rpm on smoke tests
* Update drift settings for smoke tests
* Reuse project directory var in api notebook
* Format
* Format
1 parent 83cc2da commit c02ab09


144 files changed (+1685, -2576 lines)
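Most of the commit is mechanical renaming: every create_final_* workflow output becomes a flat table name (create_final_nodes is removed outright, with no successor file). Below is a minimal sketch of that mapping applied to files on disk, for orientation only; the commit itself ships a v1-to-v2 migration notebook, and rename_outputs here is a hypothetical helper, not part of the change.

from pathlib import Path

# Old output file -> new output file, per the commit message above.
RENAMES = {
    "create_final_entities.parquet": "entities.parquet",
    "create_final_relationships.parquet": "relationships.parquet",
    "create_final_communities.parquet": "communities.parquet",
    "create_final_covariates.parquet": "covariates.parquet",
    "create_final_community_reports.parquet": "community_reports.parquet",
    "create_final_text_units.parquet": "text_units.parquet",
    "create_final_documents.parquet": "documents.parquet",
}

def rename_outputs(output_dir: str) -> None:
    # Hypothetical helper: rename any old-style outputs found in output_dir.
    for old, new in RENAMES.items():
        path = Path(output_dir) / old
        if path.exists():
            path.rename(path.with_name(new))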

.github/workflows/python-smoke-tests.yml

+2 -3

@@ -51,10 +51,9 @@ jobs:
       GRAPHRAG_CACHE_BASE_DIR: "cache"
       GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }}
       GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }}
-      GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL: ${{ secrets.GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL }}
       # We have Windows + Linux runners in 3.10 and 3.11, so we need to divide the rate limits by 4
-      GRAPHRAG_LLM_TPM: 45_000 # 180,000 / 4
-      GRAPHRAG_LLM_RPM: 270 # 1,080 / 4
+      GRAPHRAG_LLM_TPM: 100_000 # 400_000 / 4
+      GRAPHRAG_LLM_RPM: 500 # 2_000 / 4
       GRAPHRAG_EMBEDDING_TPM: 87_500 # 350,000 / 4
       GRAPHRAG_EMBEDDING_RPM: 525 # 2,100 / 4
       GRAPHRAG_CHUNK_SIZE: 1200
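The updated limits keep the divide-by-four rule from the comment (four runner combinations: Windows and Linux, each on Python 3.10 and 3.11). A quick check of the arithmetic in the diff's comments, assuming account-level quotas of 400,000 TPM and 2,000 RPM for the LLM:

RUNNERS = 4  # Windows + Linux, each on Python 3.10 and 3.11

assert 400_000 // RUNNERS == 100_000  # GRAPHRAG_LLM_TPM
assert 2_000 // RUNNERS == 500        # GRAPHRAG_LLM_RPM
assert 350_000 // RUNNERS == 87_500   # GRAPHRAG_EMBEDDING_TPM
assert 2_100 // RUNNERS == 525        # GRAPHRAG_EMBEDDING_RPM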
(new file)

@@ -0,0 +1,4 @@
+{
+    "type": "major",
+    "description": "Reorganize and rename workflows and their outputs."
+}

docs/examples_notebooks/api_overview.ipynb

+6 -13

@@ -49,7 +49,8 @@
 "source": [
 "import yaml\n",
 "\n",
-"settings = yaml.safe_load(open(\"<project_directory>/settings.yaml\")) # noqa: PTH123, SIM115"
+"PROJECT_DIRECTORY = \"<project_directory>\"\n",
+"settings = yaml.safe_load(open(f\"{PROJECT_DIRECTORY}/settings.yaml\")) # noqa: PTH123, SIM115"
 ]
 },
 {
@@ -74,9 +75,7 @@
 "source": [
 "from graphrag.config.create_graphrag_config import create_graphrag_config\n",
 "\n",
-"graphrag_config = create_graphrag_config(\n",
-"    values=settings, root_dir=\"<project_directory>\"\n",
-")"
+"graphrag_config = create_graphrag_config(values=settings, root_dir=PROJECT_DIRECTORY)"
 ]
 },
 {
@@ -126,20 +125,14 @@
 "source": [
 "import pandas as pd\n",
 "\n",
-"final_nodes = pd.read_parquet(\"<project_directory>/output/create_final_nodes.parquet\")\n",
-"final_entities = pd.read_parquet(\n",
-"    \"<project_directory>/output/create_final_entities.parquet\"\n",
-")\n",
-"final_communities = pd.read_parquet(\n",
-"    \"<project_directory>/output/create_final_communities.parquet\"\n",
-")\n",
+"final_entities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/entities.parquet\")\n",
+"final_communities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/communities.parquet\")\n",
 "final_community_reports = pd.read_parquet(\n",
-"    \"<project_directory>/output/create_final_community_reports.parquet\"\n",
+"    f\"{PROJECT_DIRECTORY}/output/community_reports.parquet\"\n",
 ")\n",
 "\n",
 "response, context = await api.global_search(\n",
 "    config=graphrag_config,\n",
-"    nodes=final_nodes,\n",
 "    entities=final_entities,\n",
 "    communities=final_communities,\n",
 "    community_reports=final_community_reports,\n",

docs/examples_notebooks/drift_search.ipynb

+9 -9

@@ -194,22 +194,22 @@
 "INPUT_DIR = \"./inputs/operation dulce\"\n",
 "LANCEDB_URI = f\"{INPUT_DIR}/lancedb\"\n",
 "\n",
-"COMMUNITY_REPORT_TABLE = \"create_final_community_reports\"\n",
-"ENTITY_TABLE = \"create_final_nodes\"\n",
-"ENTITY_EMBEDDING_TABLE = \"create_final_entities\"\n",
-"RELATIONSHIP_TABLE = \"create_final_relationships\"\n",
-"COVARIATE_TABLE = \"create_final_covariates\"\n",
-"TEXT_UNIT_TABLE = \"create_final_text_units\"\n",
+"COMMUNITY_REPORT_TABLE = \"community_reports\"\n",
+"COMMUNITY_TABLE = \"communities\"\n",
+"ENTITY_TABLE = \"entities\"\n",
+"RELATIONSHIP_TABLE = \"relationships\"\n",
+"COVARIATE_TABLE = \"covariates\"\n",
+"TEXT_UNIT_TABLE = \"text_units\"\n",
 "COMMUNITY_LEVEL = 2\n",
 "\n",
 "\n",
 "# read nodes table to get community and degree data\n",
 "entity_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_TABLE}.parquet\")\n",
-"entity_embedding_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet\")\n",
+"community_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet\")\n",
 "\n",
 "print(f\"Entity df columns: {entity_df.columns}\")\n",
 "\n",
-"entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)\n",
+"entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)\n",
 "\n",
 "# load description embeddings to an in-memory lancedb vectorstore\n",
 "# to connect to a remote db, specify url and port values.\n",
@@ -286,7 +286,7 @@
 "report_df = read_community_reports(INPUT_DIR)\n",
 "reports = read_indexer_reports(\n",
 "    report_df,\n",
-"    entity_df,\n",
+"    community_df,\n",
 "    COMMUNITY_LEVEL,\n",
 "    content_embedding_col=\"full_content_embeddings\",\n",
 ")\n",

docs/examples_notebooks/global_search.ipynb

+9 -11

@@ -75,9 +75,9 @@
 "source": [
 "### Load community reports as context for global search\n",
 "\n",
-"- Load all community reports in the `create_final_community_reports` table from the GraphRAG, to be used as context data for global search.\n",
-"- Load entities from the `create_final_nodes` and `create_final_entities` tables from the GraphRAG, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
-"- Load all communities in the `create_final_communites` table from the GraphRAG, to be used to reconstruct the community graph hierarchy for dynamic community selection."
+"- Load all community reports in the `community_reports` table from GraphRAG, to be used as context data for global search.\n",
+"- Load entities from the `entities` table from GraphRAG, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
+"- Load all communities in the `communities` table from GraphRAG, to be used to reconstruct the community graph hierarchy for dynamic community selection."
 ]
 },
 {
@@ -88,10 +88,9 @@
 "source": [
 "# parquet files generated from indexing pipeline\n",
 "INPUT_DIR = \"./inputs/operation dulce\"\n",
-"COMMUNITY_TABLE = \"create_final_communities\"\n",
-"COMMUNITY_REPORT_TABLE = \"create_final_community_reports\"\n",
-"ENTITY_TABLE = \"create_final_nodes\"\n",
-"ENTITY_EMBEDDING_TABLE = \"create_final_entities\"\n",
+"COMMUNITY_TABLE = \"communities\"\n",
+"COMMUNITY_REPORT_TABLE = \"community_reports\"\n",
+"ENTITY_TABLE = \"entities\"\n",
 "\n",
 "# community level in the Leiden community hierarchy from which we will load the community reports\n",
 "# higher value means we use reports from more fine-grained communities (at the cost of higher computation cost)\n",
@@ -298,11 +297,10 @@
 "community_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet\")\n",
 "entity_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_TABLE}.parquet\")\n",
 "report_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet\")\n",
-"entity_embedding_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet\")\n",
 "\n",
-"communities = read_indexer_communities(community_df, entity_df, report_df)\n",
-"reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)\n",
-"entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)\n",
+"communities = read_indexer_communities(community_df, report_df)\n",
+"reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)\n",
+"entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)\n",
 "\n",
 "print(f\"Total report count: {len(report_df)}\")\n",
 "print(\n",

docs/examples_notebooks/global_search_with_dynamic_community_selection.ipynb

+9 -11

@@ -75,9 +75,9 @@
 "source": [
 "### Load community reports as context for global search\n",
 "\n",
-"- Load all community reports in the `create_final_community_reports` table from the ire-indexing engine, to be used as context data for global search.\n",
-"- Load entities from the `create_final_nodes` and `create_final_entities` tables from the ire-indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
-"- Load all communities in the `create_final_communites` table from the ire-indexing engine, to be used to reconstruct the community graph hierarchy for dynamic community selection."
+"- Load all community reports in the `community_reports` table from the indexing engine, to be used as context data for global search.\n",
+"- Load entities from the `entities` table from the indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
+"- Load all communities in the `communities` table from the indexing engine, to be used to reconstruct the community graph hierarchy for dynamic community selection."
 ]
 },
 {
@@ -88,10 +88,9 @@
 "source": [
 "# parquet files generated from indexing pipeline\n",
 "INPUT_DIR = \"./inputs/operation dulce\"\n",
-"COMMUNITY_TABLE = \"create_final_communities\"\n",
-"COMMUNITY_REPORT_TABLE = \"create_final_community_reports\"\n",
-"ENTITY_TABLE = \"create_final_nodes\"\n",
-"ENTITY_EMBEDDING_TABLE = \"create_final_entities\"\n",
+"COMMUNITY_TABLE = \"communities\"\n",
+"COMMUNITY_REPORT_TABLE = \"community_reports\"\n",
+"ENTITY_TABLE = \"entities\"\n",
 "\n",
 "# we don't fix a specific community level but instead use an agent to dynamically\n",
 "# search through all the community reports to check if they are relevant.\n",
@@ -274,17 +273,16 @@
 "community_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet\")\n",
 "entity_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_TABLE}.parquet\")\n",
 "report_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet\")\n",
-"entity_embedding_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet\")\n",
 "\n",
-"communities = read_indexer_communities(community_df, entity_df, report_df)\n",
+"communities = read_indexer_communities(community_df, report_df)\n",
 "reports = read_indexer_reports(\n",
 "    report_df,\n",
-"    entity_df,\n",
+"    community_df,\n",
 "    community_level=COMMUNITY_LEVEL,\n",
 "    dynamic_community_selection=True,\n",
 ")\n",
 "entities = read_indexer_entities(\n",
-"    entity_df, entity_embedding_df, community_level=COMMUNITY_LEVEL\n",
+"    entity_df, community_df, community_level=COMMUNITY_LEVEL\n",
 ")\n",
 "\n",
 "print(f\"Total report count: {len(report_df)}\")\n",

docs/examples_notebooks/index_migration.ipynb renamed to docs/examples_notebooks/index_migration_to_v1.ipynb

+25 -18

@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 66,
+"execution_count": 62,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -14,9 +14,9 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Index Migration\n",
+"## Index Migration (pre-v1 to v1)\n",
 "\n",
-"This notebook is used to maintain data model parity with older indexes for the latest versions of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n",
+"This notebook is used to maintain data model parity with older indexes for version 1.0 of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n",
 "\n",
 "NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have default vector store config, which is now required or indexing will fail.\n",
 "\n",
@@ -25,40 +25,38 @@
 },
 {
 "cell_type": "code",
-"execution_count": 67,
+"execution_count": 63,
 "metadata": {},
 "outputs": [],
 "source": [
-"# This is the directory that has your settings.yml\n",
+"# This is the directory that has your settings.yaml\n",
 "# NOTE: much older indexes may have been output with a timestamped directory\n",
-"# if this is the case, you will need to make sure the storage.base_dir in settings.yml points to it correctly\n",
-"PROJECT_DIRECTORY = \"<your project directory>\""
+"# if this is the case, you will need to make sure the storage.base_dir in settings.yaml points to it correctly\n",
+"PROJECT_DIRECTORY = \"<your project directory>\""
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 64,
 "metadata": {},
 "outputs": [],
 "source": [
 "from pathlib import Path\n",
 "\n",
 "from graphrag.config.load_config import load_config\n",
-"from graphrag.config.resolve_path import resolve_paths\n",
 "from graphrag.storage.factory import StorageFactory\n",
 "\n",
-"# This first block does some config loading, path resolution, and translation that is normally done by the CLI/API when running a full workflow\n",
 "config = load_config(Path(PROJECT_DIRECTORY))\n",
-"resolve_paths(config)\n",
-"storage_config = config.storage.model_dump() # type: ignore\n",
+"storage_config = config.output.model_dump()\n",
 "storage = StorageFactory().create_storage(\n",
-"    storage_type=storage_config[\"type\"], kwargs=storage_config\n",
+"    storage_type=storage_config[\"type\"],\n",
+"    kwargs=storage_config,\n",
 ")"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 69,
+"execution_count": 65,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -69,7 +67,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 70,
+"execution_count": 66,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -98,7 +96,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 71,
+"execution_count": 67,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -134,7 +132,7 @@
 "if \"name\" in final_entities.columns:\n",
 "    final_entities.rename(columns={\"name\": \"title\"}, inplace=True)\n",
 "remove_columns(\n",
-"    final_entities, [\"mname_embedding\", \"graph_embedding\", \"description_embedding\"]\n",
+"    final_entities, [\"name_embedding\", \"graph_embedding\", \"description_embedding\"]\n",
 ")\n",
 "\n",
 "# Final nodes uses community for joins, which is now an int everywhere\n",
@@ -168,6 +166,15 @@
 "    final_communities[\"id\"] = [str(uuid4()) for _ in range(len(final_communities))]\n",
 "if \"parent\" not in final_communities.columns:\n",
 "    final_communities = final_communities.merge(parent_df, on=\"community\", how=\"left\")\n",
+"if \"entity_ids\" not in final_communities.columns:\n",
+"    node_mapping = (\n",
+"        final_nodes.loc[:, [\"community\", \"id\"]]\n",
+"        .groupby(\"community\")\n",
+"        .agg(entity_ids=(\"id\", list))\n",
+"    )\n",
+"    final_communities = final_communities.merge(\n",
+"        node_mapping, on=\"community\", how=\"left\"\n",
+"    )\n",
 "remove_columns(final_communities, [\"raw_community\"])\n",
 "\n",
 "# We need int for community and the human_readable_id copy for consistency\n",
@@ -197,7 +204,7 @@
 "source": [
 "from graphrag.cache.factory import CacheFactory\n",
 "from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks\n",
-"from graphrag.index.config.embeddings import get_embedded_fields, get_embedding_settings\n",
+"from graphrag.config.embeddings import get_embedded_fields, get_embedding_settings\n",
 "from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings\n",
 "\n",
 "# We only need to re-run the embeddings workflow, to ensure that embeddings for all required search fields are in place\n",
