Skip to content

Commit 430116b

Browse files
committed
more structuring & data upload pipeline
1 parent bd594e0 commit 430116b

27 files changed

+273
-27
lines changed

README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ python app.py
4242

4343
### Phase 1: Setup and Data Collection
4444

45-
- [ ] Set up the project repository with a basic directory structure.
46-
- [ ] Create a virtual environment and install necessary dependencies.
47-
- [ ] Implement data collection mechanisms for text and image data.
45+
- [x] Set up the project repository with a basic directory structure.
46+
- [x] Create a virtual environment and install necessary dependencies.
47+
- [x] Implement data collection mechanisms for text and image data.
4848
- [ ] Preprocess and clean the collected data for further processing.
4949

5050
### Phase 2: LLM Architecture Integration

flock/__init__.py

Whitespace-only changes.
136 Bytes
Binary file not shown.

flock/config/.env

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# MongoDB configuration
2+
MONGODB_URI=mongodb://localhost:27017/
3+
MONGODB_DB_NAME=flock_database
4+
5+
# ChromaDB configuration
6+
CHROMADB_URI=your_chromadb_uri
7+
CHROMADB_DB_NAME=your_chromadb_database_name
8+
CHROMADB_USERNAME=your_chromadb_username
9+
CHROMADB_PASSWORD=your_chromadb_password

flock/create_symlinks.py

-20
This file was deleted.
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
def lowercase(string):
    """Return a copy of *string* with all cased characters converted to lowercase."""
    lowered = string.lower()
    return lowered

flock/embedding_pipeline/tests/test_main.py

Whitespace-only changes.

flock/setup_project_path.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import sys
import os

# Make the directory containing this file importable as a package root.
# BUG FIX: the original wrapped os.path.dirname() in a single-argument
# os.path.join(), which is a no-op, and appended unconditionally, creating
# duplicate sys.path entries when this module is executed more than once.
_project_root = os.path.abspath(os.path.dirname(__file__))
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Import any common utilities or setup here, if needed
# ...
4.9 KB
Binary file not shown.

flock/src/database/__init__.py

Whitespace-only changes.
Binary file not shown.
Binary file not shown.
+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from pymongo import MongoClient
2+
3+
class MongoDatabase(object):
    """Thin wrapper around a pymongo client bound to a single database."""

    def __init__(self, uri, db_name):
        """
        :param uri: MongoDB connection URI, e.g. "mongodb://localhost:27017/".
        :param db_name: Name of the database to operate on.
        """
        self.client = MongoClient(uri)
        # BUG FIX: the original read ``self.clinet`` (typo), raising
        # AttributeError on every construction.
        self.db = self.client[db_name]

    def get_collection(self, collection_name):
        """Return the collection object named *collection_name*."""
        return self.db[collection_name]

    def find_documents(self, collection_name, filter_dict=None, limit=None, use_generator=False):
        """
        Find documents in a collection based on a filter.

        :param collection_name: The name of the collection to search.
        :param filter_dict: A dictionary containing filters (optional).
        :param limit: Maximum number of documents to retrieve (optional).
        :param use_generator: Whether to return a lazy iterator (default False).
        :return: An iterator over matching documents if use_generator is True,
                 otherwise a list; on error, an error-message string (kept for
                 backward compatibility with the original contract).
        """
        # BUG FIX: the original body contained both ``yield`` and
        # ``return list(cursor)``; the presence of ``yield`` made the whole
        # function a generator, so the list branch silently returned nothing
        # to the caller. Returning an iterator (or list) from a plain
        # function restores both advertised behaviors.
        collection = self.get_collection(collection_name)
        try:
            if filter_dict is None:
                cursor = collection.find()
            else:
                cursor = collection.find(filter_dict)

            if limit:
                cursor = cursor.limit(limit)

            if use_generator:
                return iter(cursor)
            return list(cursor)
        except Exception as e:
            # NOTE(review): returning an error string instead of raising hides
            # failures from callers; preserved for backward compatibility.
            return f"Error occurred: {str(e)}"

flock/src/embedding_pipeline/__init__.py

-1
This file was deleted.

flock/src/main.py

+85-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,85 @@
1-
# orchestrates the embedding pipeline
2-
# loads data, preprocesses it, generates/embeds text
1+
import uvicorn
from fastapi import FastAPI, HTTPException, File, UploadFile
from typing import List, Dict
import requests, json
import subprocess

import magic

from pipelineManager.pipelineValidator import PipelineValidator

pipeline_validator = PipelineValidator()
app = FastAPI()


@app.get('/')
def root():
    """Health-check endpoint."""
    return {"message": "App is running! :)"}


@app.post("/create-pipeline/")
def create_pipeline_config(data: List[str], pipeline_config: dict):
    """Validate *pipeline_config*, persist it, and launch data processing."""
    try:
        if not pipeline_validator.validate(pipeline_config):
            # BUG FIX: the original fell through and implicitly returned None
            # when validation failed; report the failure explicitly.
            return {"error": "Invalid pipeline configuration."}

        # Save pipeline config to a temporary file -- TBD -- Save in MongoDB.
        # NOTE(review): the content is JSON despite the .yml suffix — confirm
        # what data_processing.py expects.
        config_path = "pipelines/temp_pipeline_config.yml"
        with open(config_path, "w") as pipeline_json_file:
            # BUG FIX: json.dumps() returns a string and does not accept a
            # file object as its second positional argument; json.dump()
            # actually writes the config to the file.
            json.dump(pipeline_config, pipeline_json_file)

        # Run the data_processing.py script with subprocess.
        # BUG FIX: the original wrote to pipelines/... but passed a bare
        # filename to the subprocess; pass the path that was written.
        subprocess.run(["python", "data_processing.py", config_path])

        return {"message": "Data processing completed."}

    except Exception as e:
        return {"error": str(e)}


@app.post("/upload-from-link/")
def upload_from_link(link: str):
    """Fetch *link* and dispatch its content by Content-Type (PDF/text/HTML)."""
    try:
        response = requests.get(link)
        if response.status_code == 200:
            content_type = response.headers.get("content-type")
            if "pdf" in content_type.lower():
                # Process PDF content
                pdf_content = response.content
                # Process the PDF content here
                return {"message": "PDF content received and processed."}
            elif "text" in content_type.lower() or "html" in content_type.lower():
                # Process text or HTML content
                text_content = response.content.decode("utf-8")
                # Process the text or HTML content here
                return {"message": "Text or HTML content received and processed."}
            else:
                return {"error": "Unsupported content type."}
        else:
            return {"error": "Unable to fetch content from link."}
    except requests.exceptions.RequestException as e:
        return {"error": "Failed to fetch content from link."}


@app.post("/upload-from-directory/")
def upload_from_directory(files: List[UploadFile] = File(...)):
    """Upload one or more files and dispatch each by its detected file type."""
    try:
        detector = magic.Magic()  # hoisted: one detector for all files
        for file in files:
            # BUG FIX: the original called file.file.read() a second time
            # after type detection; the stream was already exhausted, so PDF
            # content was always empty. Read once and reuse the buffer.
            raw = file.file.read()
            file_type = detector.from_buffer(raw)

            if "pdf" in file_type.lower():
                # Process PDF file
                pdf_content = raw

            elif "text" in file_type.lower() or "html" in file_type.lower():
                # Process text or HTML file
                text_content = raw.decode("utf-8")
                # Process the text or HTML content here

            else:
                return {"error" : f"Unsupported file type: {file_type}"}

        return {"message": f"Uploaded and processed {len(files)} files from directory."}

    except Exception as e:
        return {"error": str(e)}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
Binary file not shown.
+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
class Parameter:
    """Registry of tunable pipeline parameters and their metadata.

    Keys follow a dotted ``GROUP.SECTION.NAME`` scheme; the leading group
    token ("INPUT"/"OUTPUT") is what the accessor methods filter on. Each
    entry records a display name, description, value type, legal range, and
    default value.
    """

    def __init__(self):
        self.parameter_dict = {
            "TRANSFORMERS.MODEL_NAME": {
                "name": "TRANSFORMER_MODEL.MODEL_NAME",
                "description": "Transformer model name",
                "type": "list",
                "range": ['llama2-7b', 'llama2-13b', 'wizardlm', 'bloom', 'falcon'],
                "default": "llama2-7b"
            },
            "EMBEDDING.MODEL_NAME": {
                "name": "EMBEDDING.MODEL_NAME",
                # BUG FIX: description read "Emebedding model name" (typo).
                "description": "Embedding model name",
                "type": "list",
                "range": ['hkunlp/instructor-base'],
                "default": "hkunlp/instructor-base"
            },
            "OUTPUT.TRANSFORMERS.TEMPERATURE": {
                "name": "OUTPUT_PARAMETERS.TEMPERATURE",
                "description": "Higher temperature values (e.g., 0.8 to 1.0) lead to more diverse and random output. Lower values (e.g., 0.2 to 0.5) make the output more focused and deterministic.",
                "type": "percentage",
                "range": [0, 1],
                "default": 0.0
            },
            "OUTPUT.TRANSFORMERS.TOP_P": {
                "name": "OUTPUT_PARAMETERS.TOP_P",
                "description": "Higher top_p values (e.g., 0.8 to 1.0) allow more words to be considered, potentially resulting in more varied output. Lower values (e.g., 0.1 to 0.5) limit the set of words and may produce more controlled responses.",
                "type": "percentage",
                "range": [0, 1],
                "default": 0.95
            },
            "OUTPUT.TRANSFORMERS.REPETITION_PENALTY": {
                "name": "OUTPUT_PARAMETERS.REPETITION_PENALTY",
                "description": "A higher repetition penalty (e.g., 1.2 or greater) increases the penalty for repeating words, encouraging the model to generate less repetitive text. Lower values reduce the effect of repetition penalties.",
                "type": "float",
                "range": [1, float("inf")],
                "default": 1.15
            },
            "INPUT.TRANSFORMERS.SPLIT_CHUNK_SIZE": {
                "name": "INPUT_PARAMETERS.SPLIT_CHUNK_SIZE",
                "description": "Smaller split chunk sizes (e.g., a few words or a sentence) allow for fine-grained processing but might result in more disjointed or fragmented output. Larger split chunk sizes (e.g., a few paragraphs) provide more context but could approach or exceed the model's input length limits.",
                "type": "integer",
                "range": [1, float("inf")],
                "default": 800
            },
            "INPUT.TRANSFORMERS.SPLIT_OVERLAP": {
                "name": "INPUT_PARAMETERS.SPLIT_OVERLAP",
                "description": "The split size needs to be smaller than the model's maximum input length. It could range from a few tokens to just below the maximum input length, depending on how much overlap you want between adjacent chunks for context continuity.",
                "type": "integer",
                "range": [0, float("inf")],
                "default": 0
            }
        }

    def get_input_parameters(self):
        """Return the keys of all INPUT-group parameters."""
        group_name = "INPUT"
        # Substring match preserved from the original implementation.
        return [key for key in self.parameter_dict if group_name in key]

    def get_output_parameters(self):
        """Return the keys of all OUTPUT-group parameters."""
        group_name = "OUTPUT"
        return [key for key in self.parameter_dict if group_name in key]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
class PipelineValidator:
    """Validates pipeline configuration dictionaries before execution."""

    def __init__(self) -> None:
        # NOTE(review): printed on construction, not on validation; kept to
        # preserve observable behavior. A dead ``pass`` was removed.
        print("Validating Pipeline...")

    def validate(self, pipeline: dict) -> bool:
        """Return True if *pipeline* is present, False if it is None.

        BUG FIX: the return annotation said ``dict`` although the method
        returns a bool.

        :param pipeline: The pipeline configuration to check.
        """
        if pipeline is None:
            print({"message": "Missing Pipeline"})
            return False
        else:
            print({"message": "Pipeline Validated"})
            return True
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import json
2+
from pathlib import Path
3+
4+
class PipelineWrapper:
    """Builds and persists a pipeline configuration file as JSON."""

    def __init__(self, config_file_path):
        # Destination path for the generated configuration file.
        self.config_file_path = config_file_path

    def create_config(self, model_name, tasks, options=None):
        """Serialize the model name, task list and options to the config file.

        *options* falls back to an empty dict when omitted or falsy.
        """
        payload = {
            "model_name": model_name,
            "tasks": tasks,
            "options": options or {},
        }
        with open(self.config_file_path, "w") as handle:
            json.dump(payload, handle, indent=4)

# Usage example
if __name__ == "__main__":
    demo = PipelineWrapper("pipeline_config.json")
    demo.create_config("my_model", ["task1", "task2"], options={"param": "value"})

flock/src/prompt_engineering

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/home/reverie-pc/projects/Flock/flock/prompt_engineering/src

flock/tests/__init__.py

Whitespace-only changes.
142 Bytes
Binary file not shown.
Binary file not shown.

flock/tests/test_db_connection.py

Whitespace-only changes.

requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ sentence_transformers
99
accelerate==0.21.0
1010
bitsandbytes
1111
xformers
12-
einops=0.6.1
12+
einops==0.6.1
13+
python-magic==0.4.27

setup.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import os
import sys
import site

def setup_python_path():
    """Add the project root to sys.path and symlink it into site-packages.

    Makes ``import flock`` resolve without installing the package.
    """

    # TBD : Setup log for symlinks

    # Add the project root directory to the Python path.
    # BUG FIX: the original wrapped dirname() in a redundant
    # os.path.join(..., "") which only appended a trailing separator.
    project_root = os.path.abspath(os.path.dirname(__file__))
    if project_root not in sys.path:
        sys.path.append(project_root)

    # Get the site-packages directory (first global entry).
    site_packages = site.getsitepackages()[0]

    # Create a symlink to the project root directory in site-packages.
    # BUG FIX: os.path.exists() returns False for a broken symlink, so the
    # original raised FileExistsError when a stale link was left behind;
    # also check islink() before creating.
    symlink_name = os.path.join(site_packages, "flock")
    if not os.path.exists(symlink_name) and not os.path.islink(symlink_name):
        os.symlink(project_root, symlink_name)

    # TBD : setup yml file to setup the site-packages

# Run only when executed directly.
# BUG FIX (comment): the original comment claimed setup happened "when this
# script is imported", but the __main__ guard below prevents exactly that.
if __name__ == "__main__":
    setup_python_path()

0 commit comments

Comments
 (0)