# Dependency added to libs/community/extended_testing_deps.txt by the same change:
#     graphviz>=0.20.3,<0.21
#
# libs/community/langchain_community/graph_vectorstores/visualize.py
import re
from typing import TYPE_CHECKING, Dict, Iterable, Optional, Tuple

from langchain_core._api import beta
from langchain_core.documents import Document

from langchain_community.graph_vectorstores.links import get_links

if TYPE_CHECKING:
    import graphviz


def _escape_id(id: str) -> str:
    """Return ``id`` with every ``:`` replaced by ``_``, for use as a node name.

    NOTE(review): presumably needed because graphviz treats ``:`` in edge
    endpoints as a node/port separator -- confirm against the graphviz docs.
    Distinct IDs such as ``"a:b"`` and ``"a_b"`` map to the same node name.
    """
    return id.replace(":", "_")


# Maps a link direction to the value of the DOT ``dir`` edge attribute. Edges
# are always drawn from the document node to the tag node.
_EDGE_DIRECTION = {
    "in": "back",
    "out": "forward",
    "bidir": "both",
}

# One word together with the whitespace preceding it. Raw string so that
# ``\s`` and ``\S`` reach the regex engine instead of being (invalid) string
# escape sequences.
_WORD_RE = re.compile(r"\s*\S+")


def _split_prefix(s: str, max_chars: int = 50) -> str:
    """Shorten ``s`` to at most ``max_chars`` characters, cutting at a word end.

    Args:
        s: The text to shorten.
        max_chars: Maximum length of the kept prefix (the ellipsis is extra).

    Returns:
        ``s`` unchanged when nothing was cut off; otherwise the kept prefix
        followed by ``"..."``. The cut falls at the end of the last word that
        ends within ``max_chars``; if even the first word is too long, the
        text is cut at exactly ``max_chars``. Whitespace after the last kept
        word counts as cut off, so ``"a "`` becomes ``"a..."``.
    """
    # Fallback for when no word ends within the limit: hard cut.
    split = min(len(s), max_chars)
    for word in _WORD_RE.finditer(s):
        if word.end(0) > max_chars:
            break
        split = word.end(0)

    if split == len(s):
        return s
    return f"{s[0:split]}..."


@beta()
def render_graphviz(
    documents: Iterable[Document],
    engine: Optional[str] = None,
    node_color: Optional[str] = None,
    node_colors: Optional[Dict[str, Optional[str]]] = None,
    skip_tags: Iterable[Tuple[str, str]] = (),
) -> "graphviz.Digraph":
    """Render a collection of GraphVectorStore documents to GraphViz format.

    Each document becomes a ``note``-shaped node labelled with its ID and a
    shortened prefix of its content. Each distinct ``(kind, tag)`` link becomes
    a tag node, connected to every document that carries that link.

    Args:
        documents: The documents to render.
        engine: GraphViz layout engine to use. `None` uses the default.
        node_color: Default node color.
        node_colors: Dictionary specifying colors of specific nodes. Useful for
            emphasizing nodes that were selected by MMR, or differ from other
            results. An entry mapped to `None` disables the default color for
            that node.
        skip_tags: Set of tags to skip when rendering the graph. Specified as
            tuples containing the kind and tag.

    Returns:
        The "graphviz.Digraph" representing the nodes. May be printed to source,
        or rendered using `dot`.

    Raises:
        ImportError: If the `graphviz` python package is not installed.
        ValueError: If a document has no ID.

    Note:
        To render the generated DOT source code, you also need to install the
        Graphviz software itself (see https://www.graphviz.org/download/).
    """
    try:
        import graphviz
    except ImportError as e:
        # ModuleNotFoundError is a subclass of ImportError, so one clause
        # covers both; chain the cause so the original failure stays visible.
        raise ImportError(
            "Could not import graphviz python package. "
            "Please install it with `pip install graphviz`."
        ) from e

    color_overrides: Dict[str, Optional[str]] = (
        {} if node_colors is None else node_colors
    )
    skipped = set(skip_tags)
    # (kind, tag) -> name of the tag node already emitted for it.
    tag_ids: Dict[Tuple[str, str], str] = {}

    graph = graphviz.Digraph(engine=engine)
    graph.attr(rankdir="LR")
    graph.attr("node", style="filled")

    for document in documents:
        doc_id = document.id
        if doc_id is None:
            raise ValueError(f"Illegal graph document without ID: {document}")
        escaped_id = _escape_id(doc_id)
        # A key that is present wins even when its value is None, which lets
        # callers switch the default color off for individual nodes.
        color = color_overrides.get(doc_id, node_color)

        node_label = "\n".join(
            [
                graphviz.escape(doc_id),
                graphviz.escape(_split_prefix(document.page_content)),
            ]
        )
        graph.node(
            escaped_id,
            label=node_label,
            shape="note",
            fillcolor=color,
            tooltip=graphviz.escape(document.page_content),
        )

        for link in get_links(document):
            tag_key = (link.kind, link.tag)
            if tag_key in skipped:
                continue

            tag_id = tag_ids.get(tag_key)
            if tag_id is None:
                # First time this tag is seen: create its node. Tag nodes are
                # numbered in order of first appearance.
                tag_id = f"tag_{len(tag_ids)}"
                tag_ids[tag_key] = tag_id
                graph.node(tag_id, label=graphviz.escape(f"{link.kind}:{link.tag}"))

            graph.edge(escaped_id, tag_id, dir=_EDGE_DIRECTION[link.direction])
    return graph
# libs/community/tests/unit_tests/graph_vectorstores/test_visualize.py
import pytest
from langchain_core.documents import Document

from langchain_community.graph_vectorstores.links import METADATA_LINKS_KEY, Link
from langchain_community.graph_vectorstores.visualize import render_graphviz


@pytest.mark.requires("graphviz")
def test_visualize_simple_graph() -> None:
    """Check the DOT source ``render_graphviz`` emits for two linked documents.

    Covers the default rendering, the ``engine`` pass-through, a per-node color
    override, a default color disabled for one node, and ``skip_tags``.
    """
    links_a = [
        Link.incoming("href", "a"),
        Link.bidir("kw", "foo"),
    ]
    links_b = [
        Link.incoming("href", "b"),
        Link.outgoing("href", "a"),
        Link.bidir("kw", "foo"),
        Link.bidir("kw", "bar"),
    ]
    first = Document(
        id="a",
        page_content="some content",
        metadata={METADATA_LINKS_KEY: links_a},
    )
    second = Document(
        id="b",
        page_content="",
        metadata={METADATA_LINKS_KEY: links_b},
    )
    docs = [first, second]

    # Default rendering: no colors, all tags present.
    expected_plain = (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n" '
        'shape=note tooltip=""]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )
    plain = render_graphviz(docs)
    assert plain.source == expected_plain

    # The layout engine is handed through to the Digraph.
    with_engine = render_graphviz(docs, engine="fdp")
    assert with_engine.engine == "fdp"

    # Only node "a" gets a fill color.
    expected_a_gold = (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" fillcolor=gold '
        'shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n" '
        'shape=note tooltip=""]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )
    a_gold = render_graphviz(docs, node_colors={"a": "gold"})
    assert a_gold.source == expected_a_gold

    # Default color applies to "b"; the explicit None entry removes it from "a".
    expected_default_gold = (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n" fillcolor=gold '
        'shape=note tooltip=""]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )
    default_gold = render_graphviz(docs, node_color="gold", node_colors={"a": None})
    assert default_gold.source == expected_default_gold

    # Skipping ("kw", "foo") drops its node and edges and renumbers later tags.
    expected_without_foo = (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\tb [label="b\n" '
        'shape=note tooltip=""]\n'
        '\ttag_1 [label="href:b"]\n'
        "\tb -> tag_1 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        '\ttag_2 [label="kw:bar"]\n'
        "\tb -> tag_2 [dir=both]\n"
        "}\n"
    )
    without_foo = render_graphviz(docs, skip_tags=[("kw", "foo")])
    assert without_foo.source == expected_without_foo