Skip to content

Commit 430116b

Browse files
committed
more structuring & data upload pipeline
1 parent bd594e0 commit 430116b

27 files changed

+273
-27
lines changed

README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ python app.py
4242

4343
### Phase 1: Setup and Data Collection
4444

45-
- [ ] Set up the project repository with a basic directory structure.
46-
- [ ] Create a virtual environment and install necessary dependencies.
47-
- [ ] Implement data collection mechanisms for text and image data.
45+
- [x] Set up the project repository with a basic directory structure.
46+
- [x] Create a virtual environment and install necessary dependencies.
47+
- [x] Implement data collection mechanisms for text and image data.
4848
- [ ] Preprocess and clean the collected data for further processing.
4949

5050
### Phase 2: LLM Architecture Integration

flock/__init__.py

Whitespace-only changes.
136 Bytes
Binary file not shown.

flock/config/.env

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# MongoDB configuration
2+
MONGODB_URI=mongodb://localhost:27017/
3+
MONGODB_DB_NAME=flock_database
4+
5+
# ChromaDB configuration
6+
CHROMADB_URI=your_chromadb_uri
7+
CHROMADB_DB_NAME=your_chromadb_database_name
8+
CHROMADB_USERNAME=your_chromadb_username
9+
CHROMADB_PASSWORD=your_chromadb_password

flock/create_symlinks.py

-20
This file was deleted.
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
def lowercase(string):
    """Return a copy of *string* with all cased characters converted to lowercase."""
    lowered = string.lower()
    return lowered

flock/embedding_pipeline/tests/test_main.py

Whitespace-only changes.

flock/setup_project_path.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import sys
import os

# Make the directory containing this file importable as a package root.
# BUG FIX: the original wrapped os.path.dirname() in a single-argument
# os.path.join(), which is a no-op, and appended unconditionally, creating
# duplicate sys.path entries when this module is executed more than once.
_project_root = os.path.abspath(os.path.dirname(__file__))
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Import any common utilities or setup here, if needed
# ...
4.9 KB
Binary file not shown.

flock/src/database/__init__.py

Whitespace-only changes.
Binary file not shown.
Binary file not shown.
+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from pymongo import MongoClient
2+
3+
class MongoDatabase(object):
    """Thin wrapper around a pymongo client bound to a single database."""

    def __init__(self, uri, db_name):
        """
        :param uri: MongoDB connection URI, e.g. "mongodb://localhost:27017/".
        :param db_name: Name of the database to operate on.
        """
        self.client = MongoClient(uri)
        # BUG FIX: the original read ``self.clinet`` (typo), raising
        # AttributeError on every construction.
        self.db = self.client[db_name]

    def get_collection(self, collection_name):
        """Return the collection object named *collection_name*."""
        return self.db[collection_name]

    def find_documents(self, collection_name, filter_dict=None, limit=None, use_generator=False):
        """
        Find documents in a collection based on a filter.

        :param collection_name: The name of the collection to search.
        :param filter_dict: A dictionary containing filters (optional).
        :param limit: Maximum number of documents to retrieve (optional).
        :param use_generator: Whether to return a lazy iterator (default False).
        :return: An iterator over matching documents if use_generator is True,
                 otherwise a list; on error, an error-message string (kept for
                 backward compatibility with the original contract).
        """
        # BUG FIX: the original body contained both ``yield`` and
        # ``return list(cursor)``; the presence of ``yield`` made the whole
        # function a generator, so the list branch silently returned nothing
        # to the caller. Returning an iterator (or list) from a plain
        # function restores both advertised behaviors.
        collection = self.get_collection(collection_name)
        try:
            if filter_dict is None:
                cursor = collection.find()
            else:
                cursor = collection.find(filter_dict)

            if limit:
                cursor = cursor.limit(limit)

            if use_generator:
                return iter(cursor)
            return list(cursor)
        except Exception as e:
            # NOTE(review): returning an error string instead of raising hides
            # failures from callers; preserved for backward compatibility.
            return f"Error occurred: {str(e)}"

flock/src/embedding_pipeline/__init__.py

-1
This file was deleted.

flock/src/main.py

+85-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,85 @@
1-
# orchestrates the embedding pipeline
2-
# loads data, preprocesses it, generates/embeds text
1+
import uvicorn
from fastapi import FastAPI, HTTPException, File, UploadFile
from typing import List, Dict
import requests, json
import subprocess

import magic

from pipelineManager.pipelineValidator import PipelineValidator

pipeline_validator = PipelineValidator()
app = FastAPI()


@app.get('/')
def root():
    """Health-check endpoint."""
    return {"message": "App is running! :)"}


@app.post("/create-pipeline/")
def create_pipeline_config(data: List[str], pipeline_config: dict):
    """Validate *pipeline_config*, persist it, and launch data processing."""
    try:
        if not pipeline_validator.validate(pipeline_config):
            # BUG FIX: the original fell through and implicitly returned None
            # when validation failed; report the failure explicitly.
            return {"error": "Invalid pipeline configuration."}

        # Save pipeline config to a temporary file -- TBD -- Save in MongoDB.
        # NOTE(review): the content is JSON despite the .yml suffix — confirm
        # what data_processing.py expects.
        config_path = "pipelines/temp_pipeline_config.yml"
        with open(config_path, "w") as pipeline_json_file:
            # BUG FIX: json.dumps() returns a string and does not accept a
            # file object as its second positional argument; json.dump()
            # actually writes the config to the file.
            json.dump(pipeline_config, pipeline_json_file)

        # Run the data_processing.py script with subprocess.
        # BUG FIX: the original wrote to pipelines/... but passed a bare
        # filename to the subprocess; pass the path that was written.
        subprocess.run(["python", "data_processing.py", config_path])

        return {"message": "Data processing completed."}

    except Exception as e:
        return {"error": str(e)}


@app.post("/upload-from-link/")
def upload_from_link(link: str):
    """Fetch *link* and dispatch its content by Content-Type (PDF/text/HTML)."""
    try:
        response = requests.get(link)
        if response.status_code == 200:
            content_type = response.headers.get("content-type")
            if "pdf" in content_type.lower():
                # Process PDF content
                pdf_content = response.content
                # Process the PDF content here
                return {"message": "PDF content received and processed."}
            elif "text" in content_type.lower() or "html" in content_type.lower():
                # Process text or HTML content
                text_content = response.content.decode("utf-8")
                # Process the text or HTML content here
                return {"message": "Text or HTML content received and processed."}
            else:
                return {"error": "Unsupported content type."}
        else:
            return {"error": "Unable to fetch content from link."}
    except requests.exceptions.RequestException as e:
        return {"error": "Failed to fetch content from link."}


@app.post("/upload-from-directory/")
def upload_from_directory(files: List[UploadFile] = File(...)):
    """Upload one or more files and dispatch each by its detected file type."""
    try:
        detector = magic.Magic()  # hoisted: one detector for all files
        for file in files:
            # BUG FIX: the original called file.file.read() a second time
            # after type detection; the stream was already exhausted, so PDF
            # content was always empty. Read once and reuse the buffer.
            raw = file.file.read()
            file_type = detector.from_buffer(raw)

            if "pdf" in file_type.lower():
                # Process PDF file
                pdf_content = raw

            elif "text" in file_type.lower() or "html" in file_type.lower():
                # Process text or HTML file
                text_content = raw.decode("utf-8")
                # Process the text or HTML content here

            else:
                return {"error" : f"Unsupported file type: {file_type}"}

        return {"message": f"Uploaded and processed {len(files)} files from directory."}

    except Exception as e:
        return {"error": str(e)}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
Binary file not shown.
+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
class Parameter:
    """Registry of tunable pipeline parameters and their metadata.

    Keys follow a dotted ``GROUP.SECTION.NAME`` scheme; the leading group
    token ("INPUT"/"OUTPUT") is what the accessor methods filter on. Each
    entry records a display name, description, value type, legal range, and
    default value.
    """

    def __init__(self):
        self.parameter_dict = {
            "TRANSFORMERS.MODEL_NAME": {
                "name": "TRANSFORMER_MODEL.MODEL_NAME",
                "description": "Transformer model name",
                "type": "list",
                "range": ['llama2-7b', 'llama2-13b', 'wizardlm', 'bloom', 'falcon'],
                "default": "llama2-7b"
            },
            "EMBEDDING.MODEL_NAME": {
                "name": "EMBEDDING.MODEL_NAME",
                # BUG FIX: description read "Emebedding model name" (typo).
                "description": "Embedding model name",
                "type": "list",
                "range": ['hkunlp/instructor-base'],
                "default": "hkunlp/instructor-base"
            },
            "OUTPUT.TRANSFORMERS.TEMPERATURE": {
                "name": "OUTPUT_PARAMETERS.TEMPERATURE",
                "description": "Higher temperature values (e.g., 0.8 to 1.0) lead to more diverse and random output. Lower values (e.g., 0.2 to 0.5) make the output more focused and deterministic.",
                "type": "percentage",
                "range": [0, 1],
                "default": 0.0
            },
            "OUTPUT.TRANSFORMERS.TOP_P": {
                "name": "OUTPUT_PARAMETERS.TOP_P",
                "description": "Higher top_p values (e.g., 0.8 to 1.0) allow more words to be considered, potentially resulting in more varied output. Lower values (e.g., 0.1 to 0.5) limit the set of words and may produce more controlled responses.",
                "type": "percentage",
                "range": [0, 1],
                "default": 0.95
            },
            "OUTPUT.TRANSFORMERS.REPETITION_PENALTY": {
                "name": "OUTPUT_PARAMETERS.REPETITION_PENALTY",
                "description": "A higher repetition penalty (e.g., 1.2 or greater) increases the penalty for repeating words, encouraging the model to generate less repetitive text. Lower values reduce the effect of repetition penalties.",
                "type": "float",
                "range": [1, float("inf")],
                "default": 1.15
            },
            "INPUT.TRANSFORMERS.SPLIT_CHUNK_SIZE": {
                "name": "INPUT_PARAMETERS.SPLIT_CHUNK_SIZE",
                "description": "Smaller split chunk sizes (e.g., a few words or a sentence) allow for fine-grained processing but might result in more disjointed or fragmented output. Larger split chunk sizes (e.g., a few paragraphs) provide more context but could approach or exceed the model's input length limits.",
                "type": "integer",
                "range": [1, float("inf")],
                "default": 800
            },
            "INPUT.TRANSFORMERS.SPLIT_OVERLAP": {
                "name": "INPUT_PARAMETERS.SPLIT_OVERLAP",
                "description": "The split size needs to be smaller than the model's maximum input length. It could range from a few tokens to just below the maximum input length, depending on how much overlap you want between adjacent chunks for context continuity.",
                "type": "integer",
                "range": [0, float("inf")],
                "default": 0
            }
        }

    def get_input_parameters(self):
        """Return the keys of all INPUT-group parameters."""
        group_name = "INPUT"
        # Substring match preserved from the original implementation.
        return [key for key in self.parameter_dict if group_name in key]

    def get_output_parameters(self):
        """Return the keys of all OUTPUT-group parameters."""
        group_name = "OUTPUT"
        return [key for key in self.parameter_dict if group_name in key]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
class PipelineValidator:
    """Validates pipeline configuration dictionaries before execution."""

    def __init__(self) -> None:
        # NOTE(review): printed on construction, not on validation; kept to
        # preserve observable behavior. A dead ``pass`` was removed.
        print("Validating Pipeline...")

    def validate(self, pipeline: dict) -> bool:
        """Return True if *pipeline* is present, False if it is None.

        BUG FIX: the return annotation said ``dict`` although the method
        returns a bool.

        :param pipeline: The pipeline configuration to check.
        """
        if pipeline is None:
            print({"message": "Missing Pipeline"})
            return False
        else:
            print({"message": "Pipeline Validated"})
            return True
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import json
2+
from pathlib import Path
3+
4+
class PipelineWrapper:
    """Builds and persists a pipeline configuration file as JSON."""

    def __init__(self, config_file_path):
        # Destination path for the generated configuration file.
        self.config_file_path = config_file_path

    def create_config(self, model_name, tasks, options=None):
        """Serialize the model name, task list and options to the config file.

        *options* falls back to an empty dict when omitted or falsy.
        """
        payload = {
            "model_name": model_name,
            "tasks": tasks,
            "options": options or {},
        }
        with open(self.config_file_path, "w") as handle:
            json.dump(payload, handle, indent=4)

# Usage example
if __name__ == "__main__":
    demo = PipelineWrapper("pipeline_config.json")
    demo.create_config("my_model", ["task1", "task2"], options={"param": "value"})

flock/src/prompt_engineering

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/home/reverie-pc/projects/Flock/flock/prompt_engineering/src

flock/tests/__init__.py

Whitespace-only changes.
142 Bytes
Binary file not shown.
Binary file not shown.

flock/tests/test_db_connection.py

Whitespace-only changes.

requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ sentence_transformers
99
accelerate==0.21.0
1010
bitsandbytes
1111
xformers
12-
einops=0.6.1
12+
einops==0.6.1
13+
python-magic==0.4.27

setup.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import os
import sys
import site

def setup_python_path():
    """Add the project root to sys.path and symlink it into site-packages.

    Makes ``import flock`` resolve without installing the package.
    """

    # TBD : Setup log for symlinks

    # Add the project root directory to the Python path.
    # BUG FIX: the original wrapped dirname() in a redundant
    # os.path.join(..., "") which only appended a trailing separator.
    project_root = os.path.abspath(os.path.dirname(__file__))
    if project_root not in sys.path:
        sys.path.append(project_root)

    # Get the site-packages directory (first global entry).
    site_packages = site.getsitepackages()[0]

    # Create a symlink to the project root directory in site-packages.
    # BUG FIX: os.path.exists() returns False for a broken symlink, so the
    # original raised FileExistsError when a stale link was left behind;
    # also check islink() before creating.
    symlink_name = os.path.join(site_packages, "flock")
    if not os.path.exists(symlink_name) and not os.path.islink(symlink_name):
        os.symlink(project_root, symlink_name)

    # TBD : setup yml file to setup the site-packages

# Run only when executed directly.
# BUG FIX (comment): the original comment claimed setup happened "when this
# script is imported", but the __main__ guard below prevents exactly that.
if __name__ == "__main__":
    setup_python_path()

0 commit comments

Comments
 (0)