|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": 66, |
| 5 | + "execution_count": 62, |
6 | 6 | "metadata": {},
|
7 | 7 | "outputs": [],
|
8 | 8 | "source": [
|
|
14 | 14 | "cell_type": "markdown",
|
15 | 15 | "metadata": {},
|
16 | 16 | "source": [
|
17 |
| - "## Index Migration\n", |
| 17 | + "## Index Migration (pre-v1 to v1)\n", |
18 | 18 | "\n",
|
19 |
| - "This notebook is used to maintain data model parity with older indexes for the latest versions of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n", |
| 19 | + "This notebook is used to maintain data model parity with older indexes for version 1.0 of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n", |
20 | 20 | "\n",
|
21 | 21 | "NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have default vector store config, which is now required or indexing will fail.\n",
|
22 | 22 | "\n",
|
|
25 | 25 | },
|
26 | 26 | {
|
27 | 27 | "cell_type": "code",
|
28 |
| - "execution_count": 67, |
| 28 | + "execution_count": 63, |
29 | 29 | "metadata": {},
|
30 | 30 | "outputs": [],
|
31 | 31 | "source": [
|
32 |
| - "# This is the directory that has your settings.yml\n", |
| 32 | + "# This is the directory that has your settings.yaml\n", |
33 | 33 | "# NOTE: much older indexes may have been output with a timestamped directory\n",
|
34 |
| - "# if this is the case, you will need to make sure the storage.base_dir in settings.yml points to it correctly\n", |
35 |
| - "PROJECT_DIRECTORY = \"<your project directory>\"" |
| 34 | + "# if this is the case, you will need to make sure the storage.base_dir in settings.yaml points to it correctly\n", |
| 35 | + "PROJECT_DIRECTORY = \"<your project directory>\""
36 | 36 | ]
|
37 | 37 | },
|
38 | 38 | {
|
39 | 39 | "cell_type": "code",
|
40 |
| - "execution_count": null, |
| 40 | + "execution_count": 64, |
41 | 41 | "metadata": {},
|
42 | 42 | "outputs": [],
|
43 | 43 | "source": [
|
44 | 44 | "from pathlib import Path\n",
|
45 | 45 | "\n",
|
46 | 46 | "from graphrag.config.load_config import load_config\n",
|
47 |
| - "from graphrag.config.resolve_path import resolve_paths\n", |
48 | 47 | "from graphrag.storage.factory import StorageFactory\n",
|
49 | 48 | "\n",
|
50 |
| - "# This first block does some config loading, path resolution, and translation that is normally done by the CLI/API when running a full workflow\n", |
51 | 49 | "config = load_config(Path(PROJECT_DIRECTORY))\n",
|
52 |
| - "resolve_paths(config)\n", |
53 |
| - "storage_config = config.storage.model_dump() # type: ignore\n", |
| 50 | + "storage_config = config.output.model_dump()\n", |
54 | 51 | "storage = StorageFactory().create_storage(\n",
|
55 |
| - " storage_type=storage_config[\"type\"], kwargs=storage_config\n", |
| 52 | + " storage_type=storage_config[\"type\"],\n", |
| 53 | + " kwargs=storage_config,\n", |
56 | 54 | ")"
|
57 | 55 | ]
|
58 | 56 | },
|
59 | 57 | {
|
60 | 58 | "cell_type": "code",
|
61 |
| - "execution_count": 69, |
| 59 | + "execution_count": 65, |
62 | 60 | "metadata": {},
|
63 | 61 | "outputs": [],
|
64 | 62 | "source": [
|
|
69 | 67 | },
|
70 | 68 | {
|
71 | 69 | "cell_type": "code",
|
72 |
| - "execution_count": 70, |
| 70 | + "execution_count": 66, |
73 | 71 | "metadata": {},
|
74 | 72 | "outputs": [],
|
75 | 73 | "source": [
|
|
98 | 96 | },
|
99 | 97 | {
|
100 | 98 | "cell_type": "code",
|
101 |
| - "execution_count": 71, |
| 99 | + "execution_count": 67, |
102 | 100 | "metadata": {},
|
103 | 101 | "outputs": [],
|
104 | 102 | "source": [
|
|
134 | 132 | "if \"name\" in final_entities.columns:\n",
|
135 | 133 | " final_entities.rename(columns={\"name\": \"title\"}, inplace=True)\n",
|
136 | 134 | "remove_columns(\n",
|
137 |
| - " final_entities, [\"mname_embedding\", \"graph_embedding\", \"description_embedding\"]\n", |
| 135 | + " final_entities, [\"name_embedding\", \"graph_embedding\", \"description_embedding\"]\n", |
138 | 136 | ")\n",
|
139 | 137 | "\n",
|
140 | 138 | "# Final nodes uses community for joins, which is now an int everywhere\n",
|
|
168 | 166 | " final_communities[\"id\"] = [str(uuid4()) for _ in range(len(final_communities))]\n",
|
169 | 167 | "if \"parent\" not in final_communities.columns:\n",
|
170 | 168 | " final_communities = final_communities.merge(parent_df, on=\"community\", how=\"left\")\n",
|
| 169 | + "if \"entity_ids\" not in final_communities.columns:\n", |
| 170 | + " node_mapping = (\n", |
| 171 | + " final_nodes.loc[:, [\"community\", \"id\"]]\n", |
| 172 | + " .groupby(\"community\")\n", |
| 173 | + " .agg(entity_ids=(\"id\", list))\n", |
| 174 | + " )\n", |
| 175 | + " final_communities = final_communities.merge(\n", |
| 176 | + " node_mapping, on=\"community\", how=\"left\"\n", |
| 177 | + " )\n", |
171 | 178 | "remove_columns(final_communities, [\"raw_community\"])\n",
|
172 | 179 | "\n",
|
173 | 180 | "# We need int for community and the human_readable_id copy for consistency\n",
|
|
197 | 204 | "source": [
|
198 | 205 | "from graphrag.cache.factory import CacheFactory\n",
|
199 | 206 | "from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks\n",
|
200 |
| - "from graphrag.index.config.embeddings import get_embedded_fields, get_embedding_settings\n", |
| 207 | + "from graphrag.config.embeddings import get_embedded_fields, get_embedding_settings\n", |
201 | 208 | "from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings\n",
|
202 | 209 | "\n",
|
203 | 210 | "# We only need to re-run the embeddings workflow, to ensure that embeddings for all required search fields are in place\n",
|
|
0 commit comments