Initial Commit

commit a4b7c82b1a
2026-01-24 02:32:25 +00:00
54 changed files with 888 additions and 0 deletions

24
backend/Dockerfile Normal file

@@ -0,0 +1,24 @@
# Use a lightweight Python image
FROM python:3.9-slim

# Set working directory inside the container
WORKDIR /app

# Copy requirements first (for better caching)
COPY requirements.txt .

# Install dependencies
# 'gunicorn' must be in your requirements.txt or installed here
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install gunicorn

# Copy the rest of the application
COPY . .

# Expose the internal port the app listens on (Gunicorn is bound to 5000 below)
EXPOSE 5000

# Command to run production server
# -w 4: 4 worker processes
# -b 0.0.0.0:5000: Bind to all interfaces inside container on port 5000
CMD ["gunicorn", "--workers", "4", "--bind", "0.0.0.0:5000", "app:app"]

9
backend/app.py Normal file

@@ -0,0 +1,9 @@
from dotenv import load_dotenv

# Load variables from a local .env file (GOOGLE_API_KEY, MONGO_URI) into the environment
load_dotenv()

from src import create_app

app = create_app()

if __name__ == "__main__":
    app.run(host='0.0.0.0', port=5000)

12
backend/requirements.txt Normal file

@@ -0,0 +1,12 @@
flask
google-genai
gunicorn
pymongo
ultralytics
opencv-python-headless
transformers
torch
pandas
pypdf
python-dotenv
flask-cors

14
backend/src/__init__.py Normal file

@@ -0,0 +1,14 @@
from flask import Flask
from flask_cors import CORS

from .routes.main import main_bp
from .routes.rag import rag_bp


def create_app():
    app = Flask(__name__)
    CORS(app)  # Enable CORS for all routes

    # Register Blueprints
    app.register_blueprint(main_bp)
    app.register_blueprint(rag_bp, url_prefix='/api/rag')

    return app
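
A quick way to exercise the factory wiring is Flask's built-in test client. This is a minimal sketch, assuming the backend/ directory is the working directory so that src is importable; the expected responses come from the route files later in this commit:

from src import create_app

app = create_app()
with app.test_client() as client:
    # Root route comes from main_bp (backend/src/routes/main.py)
    print(client.get('/').data)   # b"Hello from the organized Flask App!"
    # RAG routes are mounted under /api/rag via the url_prefix above;
    # an empty body is rejected before any Mongo/Gemini call is made
    print(client.post('/api/rag/search', json={}).status_code)   # 400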

@@ -0,0 +1,21 @@
from google import genai
import os


def generate_content(prompt, model_name="gemini-2.0-flash-exp"):
    """
    Generates content using the Google GenAI SDK.
    Defaults to the gemini-2.0-flash-exp model.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        return "Error: GOOGLE_API_KEY not found."
    try:
        client = genai.Client(api_key=api_key)
        response = client.models.generate_content(
            model=model_name,
            contents=prompt,
        )
        return response.text
    except Exception as e:
        return f"Error interacting with Gemini API: {str(e)}"
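
A RAG-style prompt can be assembled from retrieved chunks and passed to this helper. A sketch with placeholder chunk texts, assuming GOOGLE_API_KEY is set and generate_content is importable from wherever this module lives:

retrieved = ["Chunk about MongoDB Atlas.", "Chunk about Gemini embeddings."]
context = "\n\n".join(retrieved)
prompt = f"Answer using only the context below.\n\nContext:\n{context}\n\nQuestion: What database is used?"
print(generate_content(prompt))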

33
backend/src/rag/embeddings.py Normal file

@@ -0,0 +1,33 @@
from google import genai
import os


def get_embedding(text, model="gemini-embedding-001"):
    """
    Generates an embedding for the given text using the Gemini API.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable not set")
    client = genai.Client(api_key=api_key)
    result = client.models.embed_content(
        model=model,
        contents=text
    )
    return result.embeddings[0].values


def get_embeddings_batch(texts, model="gemini-embedding-001"):
    """
    Generates embeddings for a list of texts.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable not set")
    client = genai.Client(api_key=api_key)
    result = client.models.embed_content(
        model=model,
        contents=texts
    )
    # The SDK returns a list of embedding objects
    return [emb.values for emb in result.embeddings]
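
Hypothetical usage, assuming GOOGLE_API_KEY is set; the printed length is the embedding dimensionality (3072 by default for gemini-embedding-001, an assumption worth verifying against the SDK docs):

vec = get_embedding("MongoDB Atlas supports approximate vector search.")
print(len(vec))

vecs = get_embeddings_batch(["first chunk", "second chunk"])
print(len(vecs), len(vecs[0]))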

37
backend/src/rag/ingest.py Normal file

@@ -0,0 +1,37 @@
import pandas as pd
from pypdf import PdfReader
import io
import os


def load_csv(file_path):
    """
    Loads a CSV file and returns a list of strings (one per row).
    This is a simplistic implementation - in production you might want specific columns.
    """
    df = pd.read_csv(file_path)
    # Convert each row to a string representation
    return df.apply(lambda x: ' | '.join(x.astype(str)), axis=1).tolist()


def load_pdf(file_path):
    """
    Loads a PDF file and returns a list of strings (one per page).
    """
    reader = PdfReader(file_path)
    text_chunks = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            text_chunks.append(text)
    return text_chunks


def process_file(file_path):
    """
    Determines file type and returns text chunks.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.csv':
        return load_csv(file_path)
    elif ext == '.pdf':
        return load_pdf(file_path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")
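
Hypothetical usage with placeholder paths: load_pdf yields one chunk per page, while load_csv flattens each row into a ' | '-joined string.

pdf_chunks = process_file("docs/report.pdf")   # placeholder path
csv_chunks = process_file("data/table.csv")    # placeholder path
print(len(pdf_chunks), pdf_chunks[0][:80])
print(len(csv_chunks), csv_chunks[0])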

67
backend/src/rag/store.py Normal file

@@ -0,0 +1,67 @@
import os
from pymongo import MongoClient

from .embeddings import get_embeddings_batch


def get_mongo_client():
    uri = os.environ.get("MONGO_URI")
    if not uri:
        raise ValueError("MONGO_URI environment variable not set")
    return MongoClient(uri)


def ingest_documents(text_chunks, collection_name="rag_documents"):
    """
    Generates embeddings for text chunks and stores them in MongoDB.
    """
    client = get_mongo_client()
    db = client.get_database("vectors_db")  # Default DB name
    collection = db[collection_name]

    # Generate embeddings in batches (handling API limits might be needed for large sets)
    embeddings = get_embeddings_batch(text_chunks)

    documents = []
    for text, embedding in zip(text_chunks, embeddings):
        documents.append({
            "text": text,
            "embedding": embedding
        })

    if documents:
        collection.insert_many(documents)
        return len(documents)
    return 0


def vector_search(query_text, collection_name="rag_documents", num_results=5):
    """
    Performs a vector search in MongoDB.
    """
    # 1. Get embedding for the query
    from .embeddings import get_embedding
    query_embedding = get_embedding(query_text)

    client = get_mongo_client()
    db = client.get_database("vectors_db")
    collection = db[collection_name]

    # Note: You must have a vector search index defined in MongoDB Atlas for this to work.
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": num_results * 10,
                "limit": num_results
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "score": { "$meta": "vectorSearchScore" }
            }
        }
    ]

    results = list(collection.aggregate(pipeline))
    return results
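
The $vectorSearch stage above only works against a pre-built Atlas Vector Search index named vector_index on the embedding field. A minimal sketch of creating it with PyMongo (assumes a recent PyMongo with create_search_index support, an Atlas cluster, and 3072-dimensional vectors from gemini-embedding-001; the dimension is an assumption and must match the embedding model):

from pymongo.operations import SearchIndexModel
from src.rag.store import get_mongo_client

collection = get_mongo_client().get_database("vectors_db")["rag_documents"]
index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",      # field written by ingest_documents
                "numDimensions": 3072,    # assumption: default gemini-embedding-001 size
                "similarity": "cosine",
            }
        ]
    },
    name="vector_index",                  # must match the index name used in $vectorSearch
    type="vectorSearch",
)
collection.create_search_index(index_model)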

7
backend/src/routes/main.py Normal file

@@ -0,0 +1,7 @@
from flask import Blueprint

main_bp = Blueprint('main', __name__)


@main_bp.route('/')
def index():
    return "Hello from the organized Flask App!"

24
backend/src/routes/rag.py Normal file

@@ -0,0 +1,24 @@
from flask import Blueprint, request, jsonify

from ..rag.store import vector_search, ingest_documents

rag_bp = Blueprint('rag', __name__)


@rag_bp.route('/ingest', methods=['POST'])
def ingest():
    data = request.json
    text_chunks = data.get('chunks', [])
    if not text_chunks:
        return jsonify({"error": "No chunks provided"}), 400
    count = ingest_documents(text_chunks)
    return jsonify({"message": f"Ingested {count} documents"}), 201


@rag_bp.route('/search', methods=['POST'])
def search():
    data = request.json
    query = data.get('query')
    if not query:
        return jsonify({"error": "No query provided"}), 400
    results = vector_search(query)
    return jsonify({"results": results}), 200
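
End to end, the two endpoints can be exercised with any HTTP client once the container (or dev server) is listening on port 5000. A hedged sketch using the requests library (not part of requirements.txt; chunk texts and query are placeholders):

import requests

BASE = "http://localhost:5000/api/rag"   # prefix set in src/__init__.py

# Ingest a couple of placeholder chunks (embeds them and writes to MongoDB)
r = requests.post(f"{BASE}/ingest", json={"chunks": ["MongoDB stores the vectors.",
                                                     "Gemini produces the embeddings."]})
print(r.status_code, r.json())   # 201 on success

# Run a vector search over the ingested chunks
r = requests.post(f"{BASE}/search", json={"query": "Where are the vectors stored?"})
print(r.json()["results"])       # top matches with text and score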