Initial Commit

commit a4b7c82b1a
2026-01-24 02:32:25 +00:00
54 changed files with 888 additions and 0 deletions

24
backend/Dockerfile Normal file

@@ -0,0 +1,24 @@
# Use a lightweight Python image
FROM python:3.9-slim

# Set working directory inside the container
WORKDIR /app

# Copy requirements first (for better caching)
COPY requirements.txt .

# Install dependencies
# 'gunicorn' must be in your requirements.txt or installed here
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install gunicorn

# Copy the rest of the application
COPY . .

# Expose the internal port the app listens on (Gunicorn is bound to 5000 below)
EXPOSE 5000

# Command to run production server
# -w 4: 4 worker processes
# -b 0.0.0.0:5000: Bind to all interfaces inside container on port 5000
CMD ["gunicorn", "--workers", "4", "--bind", "0.0.0.0:5000", "app:app"]

9
backend/app.py Normal file

@@ -0,0 +1,9 @@
from dotenv import load_dotenv

# Load variables from a local .env file (GOOGLE_API_KEY, MONGO_URI) into the environment
load_dotenv()

from src import create_app

app = create_app()

if __name__ == "__main__":
    app.run(host='0.0.0.0', port=5000)

12
backend/requirements.txt Normal file

@@ -0,0 +1,12 @@
flask
google-genai
gunicorn
pymongo
ultralytics
opencv-python-headless
transformers
torch
pandas
pypdf
python-dotenv
flask-cors

14
backend/src/__init__.py Normal file

@@ -0,0 +1,14 @@
from flask import Flask
from flask_cors import CORS

from .routes.main import main_bp
from .routes.rag import rag_bp


def create_app():
    app = Flask(__name__)
    CORS(app)  # Enable CORS for all routes

    # Register Blueprints
    app.register_blueprint(main_bp)
    app.register_blueprint(rag_bp, url_prefix='/api/rag')

    return app
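
A quick way to exercise the factory wiring is Flask's built-in test client. This is a minimal sketch, assuming the backend/ directory is the working directory so that src is importable; the expected responses come from the route files later in this commit:

from src import create_app

app = create_app()
with app.test_client() as client:
    # Root route comes from main_bp (backend/src/routes/main.py)
    print(client.get('/').data)   # b"Hello from the organized Flask App!"
    # RAG routes are mounted under /api/rag via the url_prefix above;
    # an empty body is rejected before any Mongo/Gemini call is made
    print(client.post('/api/rag/search', json={}).status_code)   # 400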

@@ -0,0 +1,21 @@
from google import genai
import os


def generate_content(prompt, model_name="gemini-2.0-flash-exp"):
    """
    Generates content using the Google GenAI SDK.
    Defaults to the gemini-2.0-flash-exp model.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        return "Error: GOOGLE_API_KEY not found."
    try:
        client = genai.Client(api_key=api_key)
        response = client.models.generate_content(
            model=model_name,
            contents=prompt,
        )
        return response.text
    except Exception as e:
        return f"Error interacting with Gemini API: {str(e)}"
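
A RAG-style prompt can be assembled from retrieved chunks and passed to this helper. A sketch with placeholder chunk texts, assuming GOOGLE_API_KEY is set and generate_content is importable from wherever this module lives:

retrieved = ["Chunk about MongoDB Atlas.", "Chunk about Gemini embeddings."]
context = "\n\n".join(retrieved)
prompt = f"Answer using only the context below.\n\nContext:\n{context}\n\nQuestion: What database is used?"
print(generate_content(prompt))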

33
backend/src/rag/embeddings.py Normal file

@@ -0,0 +1,33 @@
from google import genai
import os


def get_embedding(text, model="gemini-embedding-001"):
    """
    Generates an embedding for the given text using the Gemini API.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable not set")
    client = genai.Client(api_key=api_key)
    result = client.models.embed_content(
        model=model,
        contents=text
    )
    return result.embeddings[0].values


def get_embeddings_batch(texts, model="gemini-embedding-001"):
    """
    Generates embeddings for a list of texts.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable not set")
    client = genai.Client(api_key=api_key)
    result = client.models.embed_content(
        model=model,
        contents=texts
    )
    # The SDK returns a list of embedding objects
    return [emb.values for emb in result.embeddings]
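
Hypothetical usage, assuming GOOGLE_API_KEY is set; the printed length is the embedding dimensionality (3072 by default for gemini-embedding-001, an assumption worth verifying against the SDK docs):

vec = get_embedding("MongoDB Atlas supports approximate vector search.")
print(len(vec))

vecs = get_embeddings_batch(["first chunk", "second chunk"])
print(len(vecs), len(vecs[0]))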

37
backend/src/rag/ingest.py Normal file

@@ -0,0 +1,37 @@
import pandas as pd
from pypdf import PdfReader
import io
import os


def load_csv(file_path):
    """
    Loads a CSV file and returns a list of strings (one per row).
    This is a simplistic implementation - in production you might want specific columns.
    """
    df = pd.read_csv(file_path)
    # Convert each row to a string representation
    return df.apply(lambda x: ' | '.join(x.astype(str)), axis=1).tolist()


def load_pdf(file_path):
    """
    Loads a PDF file and returns a list of strings (one per page).
    """
    reader = PdfReader(file_path)
    text_chunks = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            text_chunks.append(text)
    return text_chunks


def process_file(file_path):
    """
    Determines file type and returns text chunks.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.csv':
        return load_csv(file_path)
    elif ext == '.pdf':
        return load_pdf(file_path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")
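
Hypothetical usage with placeholder paths: load_pdf yields one chunk per page, while load_csv flattens each row into a ' | '-joined string.

pdf_chunks = process_file("docs/report.pdf")   # placeholder path
csv_chunks = process_file("data/table.csv")    # placeholder path
print(len(pdf_chunks), pdf_chunks[0][:80])
print(len(csv_chunks), csv_chunks[0])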

67
backend/src/rag/store.py Normal file

@@ -0,0 +1,67 @@
import os
from pymongo import MongoClient

from .embeddings import get_embeddings_batch


def get_mongo_client():
    uri = os.environ.get("MONGO_URI")
    if not uri:
        raise ValueError("MONGO_URI environment variable not set")
    return MongoClient(uri)


def ingest_documents(text_chunks, collection_name="rag_documents"):
    """
    Generates embeddings for text chunks and stores them in MongoDB.
    """
    client = get_mongo_client()
    db = client.get_database("vectors_db")  # Default DB name
    collection = db[collection_name]

    # Generate embeddings in batches (handling API limits might be needed for large sets)
    embeddings = get_embeddings_batch(text_chunks)

    documents = []
    for text, embedding in zip(text_chunks, embeddings):
        documents.append({
            "text": text,
            "embedding": embedding
        })

    if documents:
        collection.insert_many(documents)
        return len(documents)
    return 0


def vector_search(query_text, collection_name="rag_documents", num_results=5):
    """
    Performs a vector search in MongoDB.
    """
    # 1. Get embedding for the query
    from .embeddings import get_embedding
    query_embedding = get_embedding(query_text)

    client = get_mongo_client()
    db = client.get_database("vectors_db")
    collection = db[collection_name]

    # Note: You must have a vector search index defined in MongoDB Atlas for this to work.
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": num_results * 10,
                "limit": num_results
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "score": { "$meta": "vectorSearchScore" }
            }
        }
    ]

    results = list(collection.aggregate(pipeline))
    return results
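
The $vectorSearch stage above only works against a pre-built Atlas Vector Search index named vector_index on the embedding field. A minimal sketch of creating it with PyMongo (assumes a recent PyMongo with create_search_index support, an Atlas cluster, and 3072-dimensional vectors from gemini-embedding-001; the dimension is an assumption and must match the embedding model):

from pymongo.operations import SearchIndexModel
from src.rag.store import get_mongo_client

collection = get_mongo_client().get_database("vectors_db")["rag_documents"]
index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",      # field written by ingest_documents
                "numDimensions": 3072,    # assumption: default gemini-embedding-001 size
                "similarity": "cosine",
            }
        ]
    },
    name="vector_index",                  # must match the index name used in $vectorSearch
    type="vectorSearch",
)
collection.create_search_index(index_model)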

7
backend/src/routes/main.py Normal file

@@ -0,0 +1,7 @@
from flask import Blueprint

main_bp = Blueprint('main', __name__)


@main_bp.route('/')
def index():
    return "Hello from the organized Flask App!"

24
backend/src/routes/rag.py Normal file

@@ -0,0 +1,24 @@
from flask import Blueprint, request, jsonify

from ..rag.store import vector_search, ingest_documents

rag_bp = Blueprint('rag', __name__)


@rag_bp.route('/ingest', methods=['POST'])
def ingest():
    data = request.json
    text_chunks = data.get('chunks', [])
    if not text_chunks:
        return jsonify({"error": "No chunks provided"}), 400
    count = ingest_documents(text_chunks)
    return jsonify({"message": f"Ingested {count} documents"}), 201


@rag_bp.route('/search', methods=['POST'])
def search():
    data = request.json
    query = data.get('query')
    if not query:
        return jsonify({"error": "No query provided"}), 400
    results = vector_search(query)
    return jsonify({"results": results}), 200
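
End to end, the two endpoints can be exercised with any HTTP client once the container (or dev server) is listening on port 5000. A hedged sketch using the requests library (not part of requirements.txt; chunk texts and query are placeholders):

import requests

BASE = "http://localhost:5000/api/rag"   # prefix set in src/__init__.py

# Ingest a couple of placeholder chunks (embeds them and writes to MongoDB)
r = requests.post(f"{BASE}/ingest", json={"chunks": ["MongoDB stores the vectors.",
                                                     "Gemini produces the embeddings."]})
print(r.status_code, r.json())   # 201 on success

# Run a vector search over the ingested chunks
r = requests.post(f"{BASE}/search", json={"query": "Where are the vectors stored?"})
print(r.json()["results"])       # top matches with text and score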