diff --git a/docs/docs/how_to/graph_constructing.ipynb b/docs/docs/how_to/graph_constructing.ipynb index e9b6a014177c75..79b9e1463f4a2f 100644 --- a/docs/docs/how_to/graph_constructing.ipynb +++ b/docs/docs/how_to/graph_constructing.ipynb @@ -44,6 +44,9 @@ "name": "stdout", "output_type": "stream", "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } @@ -105,7 +108,7 @@ "os.environ[\"NEO4J_USERNAME\"] = \"neo4j\"\n", "os.environ[\"NEO4J_PASSWORD\"] = \"password\"\n", "\n", - "graph = Neo4jGraph()" + "graph = Neo4jGraph(refresh_schema=False)" ] }, { @@ -149,8 +152,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Nodes:[Node(id='Marie Curie', type='Person'), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization')]\n", - "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='MARRIED'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='PROFESSOR')]\n" + "Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={})]\n", + "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='MARRIED', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='PROFESSOR', properties={})]\n" ] } ], @@ -191,8 +194,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Nodes:[Node(id='Marie Curie', type='Person'), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization')]\n", - "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='SPOUSE'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='WORKED_AT')]\n" + "Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={})]\n", + "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='SPOUSE', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='WORKED_AT', properties={})]\n" ] } ], @@ -209,6 +212,44 @@ "print(f\"Relationships:{graph_documents_filtered[0].relationships}\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To define the graph schema more precisely, consider using a three-tuple approach for relationships. In this approach, each tuple consists of three elements: the source node, the relationship type, and the target node." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={})]\n", + "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='SPOUSE', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='WORKED_AT', properties={})]\n" + ] + } + ], + "source": [ + "allowed_relationships = [\n", + " (\"Person\", \"SPOUSE\", \"Person\"),\n", + " (\"Person\", \"NATIONALITY\", \"Country\"),\n", + " (\"Person\", \"WORKED_AT\", \"Organization\"),\n", + "]\n", + "\n", + "llm_transformer_tuple = LLMGraphTransformer(\n", + " llm=llm,\n", + " allowed_nodes=[\"Person\", \"Country\", \"Organization\"],\n", + " allowed_relationships=allowed_relationships,\n", + ")\n", + "llm_transformer_tuple = llm_transformer_filtered.convert_to_graph_documents(documents)\n", + "print(f\"Nodes:{graph_documents_filtered[0].nodes}\")\n", + "print(f\"Relationships:{graph_documents_filtered[0].relationships}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -229,15 +270,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Nodes:[Node(id='Marie Curie', type='Person', properties={'born_year': '1867'}), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization')]\n", - "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='SPOUSE'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='WORKED_AT')]\n" + "Nodes:[Node(id='Marie Curie', type='Person', properties={'born_year': '1867'}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={}), Node(id='Poland', type='Country', properties={}), Node(id='France', type='Country', properties={})]\n", + "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Poland', type='Country', properties={}), type='NATIONALITY', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='France', type='Country', properties={}), type='NATIONALITY', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='SPOUSE', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='WORKED_AT', properties={})]\n" ] } ], @@ -264,12 +305,71 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "graph.add_graph_documents(graph_documents_props)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most graph databases support indexes to optimize data import and retrieval. Since we might not know all the node labels in advance, we can handle this by adding a secondary base label to each node using the `baseEntityLabel` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "graph.add_graph_documents(graph_documents, baseEntityLabel=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Results will look like:\n", + "\n", + "![graph_construction3.png](../../static/img/graph_construction3.png)\n", + "\n", + "The final option is to also import the source documents for the extracted nodes and relationships. This approach lets us track which documents each entity appeared in." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "graph.add_graph_documents(graph_documents, include_source=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Graph will have the following structure:\n", + "\n", + "![graph_construction4.png](../../static/img/graph_construction4.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this visualization, the source document is highlighted in blue, with all entities extracted from it connected by `MENTIONS` relationships." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -288,7 +388,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.11.5" } }, "nbformat": 4, diff --git a/docs/static/img/graph_construction3.png b/docs/static/img/graph_construction3.png new file mode 100644 index 00000000000000..3d89b1162e6838 Binary files /dev/null and b/docs/static/img/graph_construction3.png differ diff --git a/docs/static/img/graph_construction4.png b/docs/static/img/graph_construction4.png new file mode 100644 index 00000000000000..4b892cd652b46d Binary files /dev/null and b/docs/static/img/graph_construction4.png differ