mirror of
https://github.com/SirBlobby/Hoya26.git
synced 2026-02-04 03:34:34 -05:00
Initial Commit
This commit is contained in:
14
backend/src/__init__.py
Normal file
14
backend/src/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from flask import Flask
|
||||
from flask_cors import CORS
|
||||
from .routes.main import main_bp
|
||||
from .routes.rag import rag_bp
|
||||
|
||||
def create_app():
    """
    Application factory: build the Flask app and attach its blueprints.

    CORS is enabled for every route, ``main_bp`` is registered without a
    prefix and ``rag_bp`` is registered under ``/api/rag``.

    Returns:
        The configured ``Flask`` application instance.
    """
    flask_app = Flask(__name__)

    # Enable CORS for all routes.
    CORS(flask_app)

    # Each entry pairs a blueprint with the keyword options used to mount it.
    mounts = (
        (main_bp, {}),
        (rag_bp, {"url_prefix": "/api/rag"}),
    )
    for blueprint, options in mounts:
        flask_app.register_blueprint(blueprint, **options)

    return flask_app
|
||||
0
backend/src/cv/__init__.py
Normal file
0
backend/src/cv/__init__.py
Normal file
0
backend/src/gemini/__init__.py
Normal file
0
backend/src/gemini/__init__.py
Normal file
21
backend/src/gemini/client.py
Normal file
21
backend/src/gemini/client.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
def generate_content(prompt, model_name="gemini-2.0-flash-exp"):
    """
    Send ``prompt`` to a Gemini model through the Google GenAI SDK.

    Failures are reported through the return value instead of being raised:
    a missing ``GOOGLE_API_KEY`` and any exception from the SDK call both
    produce a string starting with ``"Error"``.

    Args:
        prompt: Content forwarded to the model as ``contents``.
        model_name: Model identifier; defaults to ``gemini-2.0-flash-exp``.

    Returns:
        The ``text`` of the SDK response, or an error message string.
    """
    key = os.getenv("GOOGLE_API_KEY")
    if not key:
        return "Error: GOOGLE_API_KEY not found."

    try:
        gemini = genai.Client(api_key=key)
        reply = gemini.models.generate_content(model=model_name, contents=prompt)
        return reply.text
    except Exception as e:
        # Deliberately broad: callers receive a message, never an exception.
        return f"Error interacting with Gemini API: {str(e)}"
|
||||
0
backend/src/mongo/__init__.py
Normal file
0
backend/src/mongo/__init__.py
Normal file
33
backend/src/rag/embeddings.py
Normal file
33
backend/src/rag/embeddings.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
def get_embedding(text, model="gemini-embedding-001"):
    """
    Embed a single piece of text with the Gemini API.

    Args:
        text: Content passed to the SDK as ``contents``.
        model: Embedding model identifier.

    Returns:
        The ``values`` of the first embedding in the SDK response.

    Raises:
        ValueError: If ``GOOGLE_API_KEY`` is unset or empty.
    """
    key = os.getenv("GOOGLE_API_KEY")
    if not key:
        raise ValueError("GOOGLE_API_KEY environment variable not set")

    response = genai.Client(api_key=key).models.embed_content(
        model=model,
        contents=text,
    )
    first_embedding = response.embeddings[0]
    return first_embedding.values
|
||||
|
||||
def get_embeddings_batch(texts, model="gemini-embedding-001"):
    """
    Generate embeddings for a list of texts with the Gemini API.

    Args:
        texts: Texts to embed, passed to the SDK as ``contents``.
        model: Embedding model identifier.

    Returns:
        One embedding (the ``values`` of each SDK embedding object) per
        entry in the SDK response. An empty ``texts`` yields ``[]`` without
        contacting the API.

    Raises:
        ValueError: If ``GOOGLE_API_KEY`` is unset or empty.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable not set")

    # Nothing to embed: skip the network round-trip entirely.
    if not texts:
        return []

    client = genai.Client(api_key=api_key)
    result = client.models.embed_content(
        model=model,
        contents=texts,
    )
    # The SDK returns a list of embedding objects.
    return [emb.values for emb in result.embeddings]
|
||||
37
backend/src/rag/ingest.py
Normal file
37
backend/src/rag/ingest.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import pandas as pd
|
||||
from pypdf import PdfReader
|
||||
import io
|
||||
import os
|
||||
|
||||
def load_csv(file_path):
    """
    Load a CSV file and return one string per data row.

    Each row is rendered as its cell values joined with ``' | '``. Cells are
    stringified individually from column-typed tuples, so an integer column
    stays ``"1"`` even when another column in the same row is float (a
    row-wise ``apply`` would upcast the row and render ``"1.0"``). Missing
    values are rendered as ``"nan"``.

    This is a simplistic implementation - in production you might want
    specific columns.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A list of strings, one per row; empty if the file has no data rows.
    """
    df = pd.read_csv(file_path)
    # name=None yields plain tuples, which avoids namedtuple field renaming.
    return [
        ' | '.join(map(str, row))
        for row in df.itertuples(index=False, name=None)
    ]
|
||||
|
||||
def load_pdf(file_path):
    """
    Load a PDF file and return its text, one string per page.

    Pages whose extracted text is empty (or ``None``) are left out, so the
    result can be shorter than the page count.
    """
    pages = PdfReader(file_path).pages
    extracted = (page.extract_text() for page in pages)
    return [content for content in extracted if content]
|
||||
|
||||
def process_file(file_path):
    """
    Pick a loader from the file extension and return the file's text chunks.

    The extension is compared case-insensitively; ``.csv`` goes to
    ``load_csv`` and ``.pdf`` to ``load_pdf``.

    Raises:
        ValueError: For any other extension (including none at all).
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.csv':
        return load_csv(file_path)
    if suffix == '.pdf':
        return load_pdf(file_path)
    raise ValueError(f"Unsupported file type: {suffix}")
|
||||
67
backend/src/rag/store.py
Normal file
67
backend/src/rag/store.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import os
|
||||
from pymongo import MongoClient
|
||||
from .embeddings import get_embeddings_batch
|
||||
|
||||
# Shared clients keyed by connection URI, so every caller in this process
# reuses a single MongoClient per URI instead of opening a new one per call.
_MONGO_CLIENTS = {}


def get_mongo_client():
    """
    Return the shared ``MongoClient`` for the URI in ``MONGO_URI``.

    The client is created on first use and reused on later calls with the
    same URI; previously every call built a new client that was never
    closed. Callers should not ``close()`` the returned client, because it
    is shared.

    Returns:
        A ``MongoClient`` connected to ``MONGO_URI``.

    Raises:
        ValueError: If ``MONGO_URI`` is unset or empty.
    """
    uri = os.environ.get("MONGO_URI")
    if not uri:
        raise ValueError("MONGO_URI environment variable not set")

    client = _MONGO_CLIENTS.get(uri)
    if client is None:
        candidate = MongoClient(uri)
        # setdefault keeps whichever client was stored first if two threads
        # race here; the losing candidate is closed rather than leaked.
        client = _MONGO_CLIENTS.setdefault(uri, candidate)
        if client is not candidate:
            candidate.close()
    return client
|
||||
|
||||
def ingest_documents(text_chunks, collection_name="rag_documents"):
    """
    Generate embeddings for text chunks and store them in MongoDB.

    Each stored document has the shape ``{"text": ..., "embedding": ...}``
    and is written to ``collection_name`` in the ``vectors_db`` database.

    Args:
        text_chunks: Texts to embed and store.
        collection_name: Target collection inside ``vectors_db``.

    Returns:
        The number of documents inserted. Empty (or ``None``) input returns
        0 immediately, without opening a database connection or calling the
        embedding API.
    """
    if not text_chunks:
        return 0

    # Resolve the collection first so a missing MONGO_URI fails before any
    # embedding request is made.
    client = get_mongo_client()
    db = client.get_database("vectors_db")  # Default DB name
    collection = db[collection_name]

    # Generate embeddings in one batch (handling API limits might be needed
    # for large sets).
    embeddings = get_embeddings_batch(text_chunks)

    documents = [
        {"text": text, "embedding": embedding}
        for text, embedding in zip(text_chunks, embeddings)
    ]
    if not documents:
        return 0

    collection.insert_many(documents)
    return len(documents)
|
||||
|
||||
def vector_search(query_text, collection_name="rag_documents", num_results=5):
    """
    Perform a vector search in MongoDB for documents similar to a query.

    The query is embedded with ``get_embedding`` and run through a
    ``$vectorSearch`` stage against the ``vector_index`` index on the
    ``embedding`` field of ``collection_name`` in ``vectors_db``.

    Note: You must have a vector search index defined in MongoDB Atlas for
    this to work.

    Args:
        query_text: Text to search for.
        collection_name: Collection inside ``vectors_db`` to search.
        num_results: Maximum number of matches to return; must be >= 1.

    Returns:
        A list of dicts with ``text`` and ``score`` (the vector search
        score) keys.

    Raises:
        ValueError: If ``num_results`` is less than 1.
    """
    # Reject a nonsensical limit before spending an embedding request on it.
    if num_results < 1:
        raise ValueError("num_results must be a positive integer")

    # 1. Get embedding for the query
    from .embeddings import get_embedding
    query_embedding = get_embedding(query_text)

    client = get_mongo_client()
    db = client.get_database("vectors_db")
    collection = db[collection_name]

    # 2. Oversample candidates 10x for recall, capped at 10000, the maximum
    #    Atlas accepts for numCandidates.
    num_candidates = min(num_results * 10, 10000)

    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": num_candidates,
                "limit": num_results,
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    return list(collection.aggregate(pipeline))
|
||||
0
backend/src/routes/__init__.py
Normal file
0
backend/src/routes/__init__.py
Normal file
7
backend/src/routes/main.py
Normal file
7
backend/src/routes/main.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from flask import Blueprint
|
||||
|
||||
# Blueprint holding the application's top-level (unprefixed) routes.
main_bp = Blueprint('main', __name__)


@main_bp.route('/')
def index():
    """Serve the root URL with a plain-text greeting."""
    greeting = "Hello from the organized Flask App!"
    return greeting
|
||||
24
backend/src/routes/rag.py
Normal file
24
backend/src/routes/rag.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from flask import Blueprint, request, jsonify
|
||||
from ..rag.store import vector_search, ingest_documents
|
||||
|
||||
# Blueprint holding the RAG endpoints (ingest and search).
rag_bp = Blueprint('rag', __name__)


@rag_bp.route('/ingest', methods=['POST'])
def ingest():
    """
    Embed and store the text chunks sent in the request body.

    Expects a JSON object of the form ``{"chunks": ["text", ...]}``.

    Returns:
        201 with a confirmation message on success, or 400 with an
        ``error`` field when the body is not a JSON object, ``chunks`` is
        missing or empty, or ``chunks`` is not a list of strings.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # aborting, so every bad request gets the same JSON error shape.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    text_chunks = data.get('chunks', [])
    if not text_chunks:
        return jsonify({"error": "No chunks provided"}), 400

    # Anything other than a list of strings would be embedded incorrectly.
    if not isinstance(text_chunks, list) or not all(
        isinstance(chunk, str) for chunk in text_chunks
    ):
        return jsonify({"error": "chunks must be a list of strings"}), 400

    count = ingest_documents(text_chunks)
    return jsonify({"message": f"Ingested {count} documents"}), 201
|
||||
|
||||
@rag_bp.route('/search', methods=['POST'])
def search():
    """
    Run a vector search for the query sent in the request body.

    Expects a JSON object of the form ``{"query": "text"}``.

    Returns:
        200 with ``{"results": [...]}`` on success, or 400 with an
        ``error`` field when the body is not a JSON object, ``query`` is
        missing or blank, or ``query`` is not a string.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # aborting, so every bad request gets the same JSON error shape.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    query = data.get('query')
    if not query:
        return jsonify({"error": "No query provided"}), 400
    if not isinstance(query, str):
        return jsonify({"error": "query must be a string"}), 400
    if not query.strip():
        # Whitespace-only text carries nothing to embed.
        return jsonify({"error": "No query provided"}), 400

    results = vector_search(query)
    return jsonify({"results": results}), 200
|
||||
Reference in New Issue
Block a user