Skip to content

Commit

Permalink
I know kung fu! Add learning functionality from repositories
Browse files Browse the repository at this point in the history
This commit introduces a new feature that allows the AI Code Bot to learn from a given repository. The bot can now clone a repository, load its documents, and store them in a local vector store for future use. This will enhance the bot's ability to provide contextually relevant suggestions and responses.

Additionally, this commit includes the necessary updates to the configuration and helper functions to support this new feature. The requirements have also been updated to include the necessary dependencies.

Lastly, a new test case has been added to ensure the correct parsing of GitHub URLs. 🧪
  • Loading branch information
TechNickAI committed Jul 17, 2023
1 parent d77df30 commit 7f5ae4a
Show file tree
Hide file tree
Showing 7 changed files with 230 additions and 16 deletions.
35 changes: 34 additions & 1 deletion aicodebot/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from aicodebot import version as aicodebot_version
from aicodebot.coder import CREATIVE_TEMPERATURE, DEFAULT_MAX_TOKENS, Coder
from aicodebot.config import get_config_file, read_config
from aicodebot.config import get_config_file, get_local_data_dir, read_config
from aicodebot.helpers import RichLiveCallbackHandler, create_and_write_file, exec_and_get_output, logger
from aicodebot.learn import load_documents_from_repo, store_documents
from aicodebot.prompts import DEFAULT_PERSONALITY, PERSONALITIES, generate_files_context, get_prompt
from langchain.chains import LLMChain
from langchain.memory import ConversationTokenBufferMemory
Expand Down Expand Up @@ -352,6 +353,36 @@ def fun_fact(verbose, response_token_size):
chain.run(f"programming and artificial intelligence in the year {year}")


@cli.command
@click.option("-v", "--verbose", count=True)
@click.option("-r", "--repo-url", help="The URL of the repository to learn from")
def learn(repo_url, verbose):
    """Learn new skills and gain additional knowledge from a repository.

    Clones (or updates) the given GitHub repository under the local data
    directory, loads its documents, and stores them in a local vector store
    so they can later be used as context for the prompt.
    """
    setup_config()

    # --repo-url is declared as an option, so click passes None when it is
    # omitted; fail with a usage error instead of a TypeError in the parser.
    if not repo_url:
        raise click.UsageError("Please supply the repository to learn from with -r/--repo-url")

    try:
        _owner, repo_name = Coder.parse_github_url(repo_url)
    except ValueError as e:
        raise click.BadParameter(str(e), param_hint="--repo-url") from e

    start_time = datetime.datetime.now(datetime.timezone.utc)

    local_data_dir = get_local_data_dir()
    repo_dir = local_data_dir / "repos" / repo_name
    vector_store_dir = local_data_dir / "vector_stores" / repo_name

    Coder.clone_repo(repo_url, repo_dir)
    console.print("✅ Repo cloned.")

    console.log("Loading documents")
    documents = load_documents_from_repo(repo_dir)
    # Nothing has been indexed at this point, so only report the load.
    console.print("✅ Documents loaded.")

    with console.status("Storing the repo in the vector store", spinner=DEFAULT_SPINNER):
        store_documents(documents, vector_store_dir)

    # Report plain seconds; formatting the timedelta itself yields H:MM:SS.ffffff.
    elapsed = (datetime.datetime.now(datetime.timezone.utc) - start_time).total_seconds()
    console.print(f"✅ Repo loaded and indexed in {elapsed:.1f} seconds.")


@cli.command
@click.option("-c", "--commit", help="The commit hash to review (otherwise look at [un]staged changes).")
@click.option("-v", "--verbose", count=True)
Expand Down Expand Up @@ -478,6 +509,7 @@ def sidekick(request, verbose, response_token_size, files):
with Live(Markdown(""), auto_refresh=True) as live:
callback = RichLiveCallbackHandler(live, bot_style)
llm.callbacks = [callback] # a fresh callback handler for each question

chain.run({"task": human_input, "context": context})

if request:
Expand All @@ -497,6 +529,7 @@ def setup_config():
configure.callback(openai_api_key=os.getenv("OPENAI_API_KEY"), verbose=0)
sys.exit(0)
else:
os.environ["OPENAI_API_KEY"] = existing_config["openai_api_key"]
return existing_config


Expand Down
29 changes: 28 additions & 1 deletion aicodebot/coder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from langchain.chat_models import ChatOpenAI
from openai.api_resources import engine
from pathlib import Path
import fnmatch, functools, openai, tiktoken
import fnmatch, functools, openai, re, subprocess, tiktoken

DEFAULT_MAX_TOKENS = 512
PRECISE_TEMPERATURE = 0.05
Expand All @@ -16,6 +16,18 @@ class Coder:
git, and the local file system.
"""

@staticmethod
def clone_repo(repo_url, repo_dir):
    """Clone a git repository to a directory, or update an existing clone.

    Args:
        repo_url: URL of the repository to clone.
        repo_dir: Destination directory (str or Path).

    Raises:
        subprocess.CalledProcessError: If any git command exits non-zero.
    """
    repo_dir = Path(repo_dir)
    # Only treat the directory as an existing clone when it has its own .git
    # entry. Running `git reset --hard` in a plain directory would make git
    # operate on whatever enclosing repository it finds.
    if (repo_dir / ".git").exists():
        logger.info(f"Repo {repo_dir} already exists, updating it instead")
        # Reset it first to make sure we don't have any local changes
        subprocess.run(["git", "reset", "--hard"], cwd=repo_dir, check=True, stdout=subprocess.DEVNULL)
        subprocess.run(["git", "pull"], cwd=repo_dir, check=True)
    else:
        logger.info(f"Cloning {repo_url} to {repo_dir}")
        # "--" ends option parsing so a URL starting with "-" cannot be read as a git option
        subprocess.run(["git", "clone", "--", repo_url, str(repo_dir)], check=True)

@classmethod
def generate_directory_structure(cls, path, ignore_patterns=None, use_gitignore=True, indent=0):
"""Generate a text representation of the directory structure of a path."""
Expand Down Expand Up @@ -198,3 +210,18 @@ def git_staged_files():
@staticmethod
def git_unstaged_files():
    """Return the file names printed by `git diff HEAD --name-only`, one per list entry."""
    diff_output = exec_and_get_output(["git", "diff", "HEAD", "--name-only"])
    return diff_output.splitlines()

@staticmethod
def parse_github_url(repo_url):
    """Parse a GitHub URL and return the owner and repo name.

    Accepts https/http URLs, scp-style SSH URLs (git@github.com:owner/repo)
    and ssh:// URLs, with or without a ".git" suffix or a trailing slash.
    Surrounding whitespace is ignored.

    Returns:
        A tuple containing the owner and repo name.

    Raises:
        ValueError: If repo_url is not a string or is not a GitHub repository URL.
    """
    if not isinstance(repo_url, str):
        raise ValueError("URL is not a valid GitHub URL")

    pattern = (
        r"(?:https?://github\.com/|ssh://git@github\.com/|git@github\.com:)"
        r"([^/]+)/([^/]+?)(?:\.git)?/?"
    )
    # fullmatch anchors both ends; the repo group is lazy so an optional
    # ".git" suffix and trailing slash are not swallowed into the repo name.
    match = re.fullmatch(pattern, repo_url.strip())

    if not match:
        raise ValueError("URL is not a valid GitHub URL")

    owner, repo = match.groups()
    return owner, repo
13 changes: 13 additions & 0 deletions aicodebot/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@
import os, yaml


def get_local_data_dir():
    """Return the local data directory, creating it and its subdirectories if needed.

    The location is taken from the AICODEBOT_LOCAL_DATA_DIR environment
    variable, falling back to ~/.aicodebot_data.

    Returns:
        Path to the data directory; its "repos" and "vector_stores"
        subdirectories exist when this function returns.
    """
    data_dir = Path(os.getenv("AICODEBOT_LOCAL_DATA_DIR", str(Path.home() / ".aicodebot_data")))
    if not data_dir.exists():
        logger.debug(f"Creating local data directory {data_dir}")

    # Always ensure the subdirectories, not only on first creation: the data
    # directory may already exist (e.g. pre-created by the user) without them.
    # parents=True also covers a configured path whose parent is missing.
    for subdirectory in ("repos", "vector_stores"):
        (data_dir / subdirectory).mkdir(parents=True, exist_ok=True)

    return data_dir


def get_config_file():
    """Return the config file path: AICODEBOT_CONFIG_FILE if set, else ~/.aicodebot.yaml."""
    default_location = Path.home() / ".aicodebot.yaml"
    configured = os.getenv("AICODEBOT_CONFIG_FILE", str(default_location))
    return Path(configured)

Expand Down
125 changes: 125 additions & 0 deletions aicodebot/learn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
from aicodebot.config import get_local_data_dir
from aicodebot.helpers import logger
from git import Repo
from langchain.document_loaders import GitLoader, NotebookLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, Language, RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from pathlib import Path

DEFAULT_EXCLUDE = [".csv", ".enex", ".json", ".jsonl"]


def load_documents_from_repo(repo_dir, exclude=DEFAULT_EXCLUDE):
    """Load the files of a local git repository as documents.

    Args:
        repo_dir: Path (str or Path) of the local clone.
        exclude: File extensions (lower-case, with leading dot) to skip.

    Returns:
        A list of documents with empty and excluded files removed, and
        notebooks re-loaded through NotebookLoader.

    Raises:
        ValueError: If the repo is bare, or no branch to load can be determined.
    """
    repo_dir = Path(repo_dir)  # accept str as well; needed for the "/" join below

    repo = Repo(repo_dir)
    # Explicit raise rather than assert, so the check survives `python -O`
    if repo.bare:
        raise ValueError(f"Repo {repo_dir} does not appear to be a valid git repository")

    # Check main first, then master, then fall back to whatever is checked out
    default_branch = next((name for name in ("main", "master") if name in repo.heads), None)
    if default_branch is None:
        try:
            default_branch = repo.active_branch.name
        except TypeError as e:
            # GitPython raises TypeError when HEAD is detached
            raise ValueError(f"Repo {repo_dir} does not have a main or master branch") from e

    loader = GitLoader(repo_path=str(repo_dir), branch=default_branch)

    documents = loader.load()
    logger.info(f"Loaded {len(documents)} documents from {repo_dir}")

    # Clean up
    cleaned = []
    logger.info("Cleaning up documents")
    for document in documents:
        file_path = document.metadata["file_path"]
        if not document.page_content:
            logger.debug(f"Skipping empty file {file_path}")
            continue

        file_type = document.metadata["file_type"].lower()
        if file_type in exclude:
            logger.debug(f"Skipping excluded file {file_path}")
            continue

        # Reload notebooks
        if file_type == ".ipynb":
            logger.debug(f"Reloading notebook {file_path}")
            new_document = NotebookLoader(str(repo_dir / file_path)).load()[0]
            # Use the original metadata, because it contains file_type
            new_document.metadata = document.metadata
            cleaned.append(new_document)
        else:
            cleaned.append(document)

    return cleaned


def store_documents(documents, vector_store_dir):
    """Split documents into chunks and store them in a FAISS vector store.

    If a store already exists at vector_store_dir / "faiss_index" it is
    loaded and returned as-is; the supplied documents are not added to it.

    Args:
        documents: Documents whose metadata has "file_type" and "file_path".
        vector_store_dir: Directory (str or Path) that holds the index.

    Returns:
        The loaded or newly created FAISS vector store.

    Raises:
        ValueError: If the documents produce no chunks to store.
    """
    vector_store_file = Path(vector_store_dir) / "faiss_index"
    embeddings = OpenAIEmbeddings()
    if vector_store_file.exists():
        logger.info(f"Loading existing vector store {vector_store_file}")
        return FAISS.load_local(vector_store_file, embeddings)

    logger.info(f"Creating new vector store {vector_store_file}")

    language_extension_map = {
        ".py": Language.PYTHON,
        ".ipynb": Language.PYTHON,
        ".js": Language.JS,
        ".ts": Language.JS,
        ".html": Language.HTML,
        ".md": Language.MARKDOWN,
        ".mdx": Language.MARKDOWN,
        ".go": Language.GO,
        ".java": Language.JAVA,
        ".c": Language.CPP,
        ".cpp": Language.CPP,
        ".php": Language.PHP,
        ".rb": Language.RUBY,
        ".xml": Language.HTML,
    }

    # Splitters are built once and reused instead of being rebuilt per document
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1_000, chunk_overlap=200)
    code_splitters = {}

    files = 0
    chunks = []
    for document in documents:
        file_type = document.metadata["file_type"].lower()
        files += 1

        # Clean up
        # Remove magic text that breaks processing
        content = document.page_content.replace("<|end" + "of" + "text|>", "")  # noqa: ISC003

        language = language_extension_map.get(file_type)
        if language is not None:
            # Use a recursive splitter for code files
            logger.debug(f"Processing {document.metadata['file_path']} as {language.value} code")
            if language not in code_splitters:
                # NOTE(review): chunk_size=50 is very small next to the 1_000 used
                # for text files - confirm this is intended
                code_splitters[language] = RecursiveCharacterTextSplitter.from_language(
                    language=language, chunk_size=50, chunk_overlap=0
                )
            splitter = code_splitters[language]
        else:
            # TODO: Check if it's a text file
            if file_type not in [".txt", ".md", ".yml", ".yaml"]:
                logger.info(f"Processing {document.metadata['file_path']} as a text file")
            splitter = text_splitter

        # Pass the metadata along so every chunk keeps its file_path / file_type
        chunks += splitter.create_documents([content], metadatas=[document.metadata])

    if not chunks:
        raise ValueError(f"No content to store in {vector_store_dir} from {files} files")

    logger.info(f"Storing {len(chunks)} chunks from {files} files in {vector_store_dir}")
    vector_store = FAISS.from_documents(chunks, embeddings)
    vector_store.save_local(vector_store_file)
    return vector_store


def load_learned_repo(repo_name):
    """Load the vector store of a previously learned repo.

    Args:
        repo_name: Name of the repo, as used for its directory under
            "vector_stores" in the local data directory.

    Returns:
        The FAISS vector store loaded from disk.

    Raises:
        ValueError: If no vector store exists for repo_name.
    """
    vector_store_file = Path(get_local_data_dir() / "vector_stores" / repo_name / "faiss_index")
    if not vector_store_file.exists():
        # The learn command takes the URL through -r/--repo-url, not as a positional argument
        raise ValueError(
            f"Vector store for {repo_name} does not exist. "
            "Please run `aicodebot learn --repo-url $githuburl` first."
        )

    embeddings = OpenAIEmbeddings()
    return FAISS.load_local(vector_store_file, embeddings)
1 change: 1 addition & 0 deletions requirements/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

beautifulsoup4 # needed by langchain
click # command line interface helpers
faiss-cpu
GitPython
langchain
loguru
Expand Down
26 changes: 13 additions & 13 deletions requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile
# pip-compile requirements.in
#
aiohttp==3.8.4
# via
Expand All @@ -15,37 +15,37 @@ async-timeout==4.0.2
attrs==23.1.0
# via aiohttp
beautifulsoup4==4.12.2
# via -r requirements/requirements.in
# via -r requirements.in
certifi==2023.5.7
# via requests
charset-normalizer==3.1.0
# via
# aiohttp
# requests
click==8.1.4
# via -r requirements/requirements.in
# via -r requirements.in
dataclasses-json==0.5.8
# via langchain
faiss-cpu==1.7.4
# via -r requirements.in
frozenlist==1.3.3
# via
# aiohttp
# aiosignal
gitdb==4.0.10
# via gitpython
gitpython==3.1.32
# via -r requirements/requirements.in
greenlet==2.0.2
# via sqlalchemy
# via -r requirements.in
idna==3.4
# via
# requests
# yarl
langchain==0.0.231
# via -r requirements/requirements.in
# via -r requirements.in
langchainplus-sdk==0.0.20
# via langchain
loguru==0.7.0
# via -r requirements/requirements.in
# via -r requirements.in
markdown-it-py==3.0.0
# via rich
marshmallow==3.19.0
Expand All @@ -69,13 +69,13 @@ numpy==1.25.0
# langchain
# numexpr
openai==0.27.8
# via -r requirements/requirements.in
# via -r requirements.in
openapi-schema-pydantic==1.2.4
# via langchain
packaging==23.1
# via marshmallow
prompt-toolkit==3.0.39
# via -r requirements/requirements.in
# via -r requirements.in
pydantic==1.10.9
# via
# langchain
Expand All @@ -85,7 +85,7 @@ pygments==2.15.1
# via rich
pyyaml==6.0
# via
# -r requirements/requirements.in
# -r requirements.in
# langchain
regex==2023.6.3
# via tiktoken
Expand All @@ -96,7 +96,7 @@ requests==2.31.0
# openai
# tiktoken
rich==13.4.2
# via -r requirements/requirements.in
# via -r requirements.in
smmap==5.0.0
# via gitdb
soupsieve==2.4.1
Expand All @@ -108,7 +108,7 @@ tenacity==8.2.2
# langchain
# langchainplus-sdk
tiktoken==0.4.0
# via -r requirements/requirements.in
# via -r requirements.in
tqdm==4.65.0
# via openai
typing-extensions==4.6.3
Expand Down
17 changes: 16 additions & 1 deletion tests/test_coder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from aicodebot.coder import Coder
from aicodebot.helpers import create_and_write_file
import os
import os, pytest


def test_generate_directory_structure(tmp_path):
Expand Down Expand Up @@ -113,3 +113,18 @@ def test_git_diff_context(temp_git_repo):
commit = temp_git_repo.head.commit.hexsha
diff = Coder.git_diff_context(commit)
assert "renamedfile.txt" in diff

def test_parse_github_url():
    """parse_github_url handles https and SSH URLs, with and without the .git suffix."""
    # Test with https URL
    owner, repo = Coder.parse_github_url("https://github.com/owner/repo.git")
    assert owner == "owner"
    assert repo == "repo"

    # Test with https URL without the .git suffix
    owner, repo = Coder.parse_github_url("https://github.com/owner/repo")
    assert owner == "owner"
    assert repo == "repo"

    # Test with git URL
    owner, repo = Coder.parse_github_url("git@github.com:owner/repo.git")
    assert owner == "owner"
    assert repo == "repo"

    # Test with invalid URL
    with pytest.raises(ValueError):
        Coder.parse_github_url("not a valid url")

1 comment on commit 7f5ae4a

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 AICodeBot Review Comments:

The code changes are generally well-structured and logical. However, there are a few areas that could be improved:

  1. In aicodebot/coder.py, the clone_repo method does not handle potential exceptions from the subprocess.run calls. It would be logical to add error handling to ensure the application does not crash if the repository cloning fails.

  2. In aicodebot/learn.py, the load_documents_from_repo method assumes the existence of either a 'main' or 'master' branch. This may not always be the case. It would be more robust to handle repositories with different default branch names.

  3. The store_documents method in aicodebot/learn.py could benefit from more granular error handling and logging. This would make it easier to diagnose issues if the document storage process fails.

  4. The test test_parse_github_url in tests/test_coder.py only tests with a '.git' suffix. It would be beneficial to also test URLs without this suffix, as GitHub supports both formats.

  5. The commit message contains an emoji, which is not in line with conventional commit message standards. It would be more appropriate to remove this.

Please consider these points to improve the robustness and maintainability of the code.

Code review automatically created with AICodeBot

Please sign in to comment.