-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
I know kung fu! Add learning functionality from repositories
This commit introduces a new feature that allows the AI Code Bot to learn from a given repository. The bot can now clone a repository, load its documents, and store them in a local vector store for future use. This will enhance the bot's ability to provide contextually relevant suggestions and responses. Additionally, this commit includes the necessary updates to the configuration and helper functions to support this new feature. The requirements have also been updated to include the necessary dependencies. Lastly, a new test case has been added to ensure the correct parsing of GitHub URLs. 🧪
- Loading branch information
1 parent
d77df30
commit 7f5ae4a
Showing
7 changed files
with
230 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
from aicodebot.config import get_local_data_dir | ||
from aicodebot.helpers import logger | ||
from git import Repo | ||
from langchain.document_loaders import GitLoader, NotebookLoader | ||
from langchain.embeddings.openai import OpenAIEmbeddings | ||
from langchain.text_splitter import CharacterTextSplitter, Language, RecursiveCharacterTextSplitter | ||
from langchain.vectorstores import FAISS | ||
from pathlib import Path | ||
|
||
DEFAULT_EXCLUDE = [".csv", ".enex", ".json", ".jsonl"] | ||
|
||
|
||
def load_documents_from_repo(repo_dir, exclude=DEFAULT_EXCLUDE):
    """Load the documents of a local git repository for the vector store.

    Args:
        repo_dir: Location of the local git working tree (``str`` or
            ``pathlib.Path``).
        exclude: File extensions (lower-case, including the leading dot) to
            skip. Defaults to ``DEFAULT_EXCLUDE``.

    Returns:
        A list of the loaded documents with empty files and excluded file
        types removed. Notebooks (``.ipynb``) are re-read with
        ``NotebookLoader`` but keep the metadata produced by ``GitLoader``.

    Raises:
        AssertionError: If the repository is bare.
        ValueError: If the repository has neither a ``main`` nor a ``master``
            branch.
    """
    repo = Repo(repo_dir)
    # NOTE(review): asserts are stripped under ``python -O``; kept as-is so the
    # exception type seen by existing callers does not change.
    assert not repo.bare, f"Repo {repo_dir} does not appear to be a valid git repository"

    # Check main first, then master, then give up
    for branch in ["main", "master"]:
        if branch in repo.heads:
            default_branch = branch
            break
    else:
        raise ValueError(f"Repo {repo_dir} does not have a main or master branch")

    loader = GitLoader(repo_path=repo_dir, branch=default_branch)

    documents = loader.load()
    logger.info(f"Loaded {len(documents)} documents from {repo_dir}")

    # Clean up
    cleaned = []
    logger.info("Cleaning up documents")
    for document in documents:
        file_path = document.metadata["file_path"]
        if not document.page_content:
            logger.debug(f"Skipping empty file {file_path}")
            continue

        file_type = document.metadata["file_type"].lower()
        if file_type in exclude:
            logger.debug(f"Skipping excluded file {file_path}")
            continue

        # Reload notebooks
        if file_type == ".ipynb":
            logger.debug(f"Reloading notebook {file_path}")
            # Join through Path so a plain-string repo_dir works too
            # (``str / str`` would raise TypeError).
            notebook_path = Path(repo_dir) / file_path
            new_document = NotebookLoader(notebook_path).load()[0]
            # Use the original metadata, because it contains file_type
            new_document.metadata = document.metadata
            cleaned.append(new_document)
        else:
            cleaned.append(document)

    return cleaned
|
||
|
||
def store_documents(documents, vector_store_dir):
    """Store documents in a FAISS vector store, or load the existing one.

    If ``<vector_store_dir>/faiss_index`` already exists it is loaded and
    returned as-is; ``documents`` is not consulted in that case. Otherwise
    every document is split into chunks, embedded with ``OpenAIEmbeddings``
    and saved to that location.

    Args:
        documents: Iterable of documents whose ``metadata`` contains
            ``file_type`` and ``file_path``.
        vector_store_dir: Directory holding the index (``str`` or
            ``pathlib.Path``).

    Returns:
        The loaded or newly created FAISS vector store.
    """
    # Coerce first so a plain-string directory works as well as a Path.
    vector_store_file = Path(vector_store_dir) / "faiss_index"
    embeddings = OpenAIEmbeddings()
    if vector_store_file.exists():
        logger.info(f"Loading existing vector store {vector_store_file}")
        return FAISS.load_local(vector_store_file, embeddings)

    logger.info(f"Creating new vector store {vector_store_file}")

    language_extension_map = {
        ".py": Language.PYTHON,
        ".ipynb": Language.PYTHON,
        ".js": Language.JS,
        ".ts": Language.JS,
        ".html": Language.HTML,
        ".md": Language.MARKDOWN,
        ".mdx": Language.MARKDOWN,
        ".go": Language.GO,
        ".java": Language.JAVA,
        ".c": Language.CPP,
        ".cpp": Language.CPP,
        ".php": Language.PHP,
        ".rb": Language.RUBY,
        ".xml": Language.HTML,
    }

    files = 0
    chunks = []
    for document in documents:
        file_type = document.metadata["file_type"].lower()
        files += 1

        # Clean up
        # Remove magic text that breaks processing
        content = document.page_content.replace("<|end" + "of" + "text|>", "")  # noqa: ISC003

        language = language_extension_map.get(file_type)
        if language is not None:
            # Use a recursive splitter for code files
            logger.debug(f"Processing {document.metadata['file_path']} as {language.value} code")
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=language, chunk_size=50, chunk_overlap=0
            )
        else:
            # TODO: Check if it's a text file
            # Only extensions outside this plain-text list are logged
            # (presumably to surface unexpected file types — confirm intent).
            if file_type not in [".txt", ".md", ".yml", ".yaml"]:
                logger.info(f"Processing {document.metadata['file_path']} as a text file")
            splitter = CharacterTextSplitter(separator="\n", chunk_size=1_000, chunk_overlap=200)

        # Pass the metadata along so each chunk can be traced back to its source file.
        chunks += splitter.create_documents([content], metadatas=[document.metadata])

    logger.info(f"Storing {len(chunks)} chunks from {files} files in {vector_store_dir}")
    vector_store = FAISS.from_documents(chunks, embeddings)
    vector_store.save_local(vector_store_file)
    return vector_store
|
||
|
||
def load_learned_repo(repo_name):
    """Return the FAISS vector store previously saved for ``repo_name``.

    The index is looked up at
    ``<local data dir>/vector_stores/<repo_name>/faiss_index``.

    Raises:
        ValueError: If no index exists at that location.
    """
    repo_store_dir = get_local_data_dir() / "vector_stores" / repo_name
    index_path = Path(repo_store_dir / "faiss_index")

    if index_path.exists():
        return FAISS.load_local(index_path, OpenAIEmbeddings())

    raise ValueError(f"Vector store for {repo_name} does not exist. Please run `aicodebot learn $githuburl` first.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
from aicodebot.coder import Coder | ||
from aicodebot.helpers import create_and_write_file | ||
import os | ||
import os, pytest | ||
|
||
|
||
def test_generate_directory_structure(tmp_path): | ||
|
@@ -113,3 +113,18 @@ def test_git_diff_context(temp_git_repo): | |
commit = temp_git_repo.head.commit.hexsha | ||
diff = Coder.git_diff_context(commit) | ||
assert "renamedfile.txt" in diff | ||
|
||
def test_parse_github_url():
    """Check that Coder.parse_github_url extracts (owner, repo) from GitHub URLs."""
    # Test with https URL
    owner, repo = Coder.parse_github_url("https://github.com/owner/repo.git")
    assert owner == "owner"
    assert repo == "repo"

    # Test with git (SSH) URL
    owner, repo = Coder.parse_github_url("git@github.com:owner/repo.git")
    assert owner == "owner"
    assert repo == "repo"

    # Test with invalid URL
    with pytest.raises(ValueError):
        Coder.parse_github_url("not a valid url")
7f5ae4a
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🤖 AICodeBot Review Comments:
The code changes are generally well-structured and logical. However, there are a few areas that could be improved:
- In `aicodebot/coder.py`, the `clone_repo` method does not handle potential exceptions from the `subprocess.run` calls. It would be logical to add error handling to ensure the application does not crash if the repository cloning fails.
- In `aicodebot/learn.py`, the `load_documents_from_repo` method assumes the existence of either a 'main' or 'master' branch. This may not always be the case. It would be more robust to handle repositories with different default branch names.
- The `store_documents` method in `aicodebot/learn.py` could benefit from more granular error handling and logging. This would make it easier to diagnose issues if the document storage process fails.
- The test `test_parse_github_url` in `tests/test_coder.py` only tests with a '.git' suffix. It would be beneficial to also test URLs without this suffix, as GitHub supports both formats.
- The commit message contains an emoji, which is not in line with conventional commit message standards. It would be more appropriate to remove this.
Please consider these points to improve the robustness and maintainability of the code.
Code review automatically created with AICodeBot