Populate DB Chromadb

2026-02-04 03:34:34 -05:00 · 2026-01-24 07:52:48 +00:00
parent d145f7e94c
commit 4298368b63
10 changed files with 279 additions and 48 deletions
--- a/backend/src/chroma/vector_store.py
+++ b/backend/src/chroma/vector_store.py
@@ -0,0 +1,69 @@
+import chromadb
+
+# Address of the Chroma server; passed as ``host`` to chromadb.HttpClient.
+CHROMA_HOST = "http://chroma.sirblob.co"
+# Collection used whenever a caller does not pass ``collection_name``.
+COLLECTION_NAME = "rag_documents"
+
+# Cached client instance; stays None until get_chroma_client() first creates it.
+_client = None
+
+def get_chroma_client():
+    """Return the shared Chroma HTTP client, creating it on first use.
+
+    The client is stored in the module-level ``_client`` so that later calls
+    reuse the same instance instead of constructing a new one.
+    """
+    global _client
+    if _client is not None:
+        return _client
+    _client = chromadb.HttpClient(host=CHROMA_HOST)
+    return _client
+
+def get_collection(collection_name=COLLECTION_NAME):
+    """Fetch the named collection from the shared client, creating it if absent."""
+    return get_chroma_client().get_or_create_collection(name=collection_name)
+
+def insert_documents(texts, embeddings, collection_name=COLLECTION_NAME, metadata_list=None):
+    """Add documents and their precomputed embeddings to a collection.
+
+    Args:
+        texts: Document strings to store.
+        embeddings: Embedding vectors, one per entry in ``texts`` and in the
+            same order.
+        collection_name: Name of the target collection (obtained through
+            ``get_collection``).
+        metadata_list: Optional per-document metadata aligned with ``texts``.
+            Left out of the request when falsy.
+
+    Returns:
+        The number of documents submitted; 0 when ``texts`` is empty, in which
+        case no request is made.
+    """
+    import hashlib
+
+    texts = list(texts)
+    if not texts:
+        return 0
+
+    # The builtin hash() of a str is salted per interpreter process, so it
+    # yields a different id for the same text on every run. A content digest
+    # keeps ids reproducible. The position prefix keeps ids unique when the
+    # same text appears more than once in one batch.
+    # "surrogatepass" lets text with lone surrogates be hashed, as hash() did.
+    ids = [
+        f"doc_{i}_{hashlib.sha256(text.encode('utf-8', 'surrogatepass')).hexdigest()}"
+        for i, text in enumerate(texts)
+    ]
+
+    add_kwargs = {
+        "ids": ids,
+        "embeddings": embeddings,
+        "documents": texts,
+    }
+    if metadata_list:
+        add_kwargs["metadatas"] = metadata_list
+
+    get_collection(collection_name).add(**add_kwargs)
+    return len(texts)
+
+def search_documents(query_embedding, collection_name=COLLECTION_NAME, num_results=5, filter_metadata=None):
+    """Query a collection for the documents nearest to one embedding.
+
+    Args:
+        query_embedding: A single embedding vector to search with.
+        collection_name: Name of the collection to query.
+        num_results: Maximum number of matches requested (``n_results``).
+        filter_metadata: Optional metadata filter, forwarded as the ``where``
+            argument of the query when truthy.
+
+    Returns:
+        A list of ``{"text": ..., "score": ...}`` dicts in the order returned
+        by the query. ``score`` is the raw distance reported for the match, or
+        ``None`` when the result carries no distances.
+    """
+    collection = get_collection(collection_name)
+
+    query_params = {
+        "query_embeddings": [query_embedding],
+        "n_results": num_results,
+    }
+    if filter_metadata:
+        query_params["where"] = filter_metadata
+
+    results = collection.query(**query_params)
+    if not results:
+        return []
+
+    # Exactly one query embedding is sent, so only the first row is relevant.
+    documents = (results.get("documents") or [[]])[0]
+
+    # The "distances" key can be present with a None value; a bare
+    # membership test would then try to subscript None.
+    distance_rows = results.get("distances")
+    distances = distance_rows[0] if distance_rows else None
+
+    return [
+        {
+            "text": doc,
+            "score": distances[i] if distances is not None else None,
+        }
+        for i, doc in enumerate(documents)
+    ]
+
+def delete_documents_by_source(source_file, collection_name=COLLECTION_NAME):
+    """Delete every record whose ``source`` metadata equals ``source_file``.
+
+    Returns the number of ids that were passed to the delete call, or 0 when
+    the lookup found no matching records.
+    """
+    target = get_collection(collection_name)
+    matching_ids = target.get(where={"source": source_file})["ids"]
+    if not matching_ids:
+        return 0
+    target.delete(ids=matching_ids)
+    return len(matching_ids)