使用示例#
本节提供了在各种场景下使用 VectorStore 模块的实用示例。
基本文本搜索#
简单的文本搜索功能:
from evofabric.core.vectorstore import ChromaDB
from evofabric.core.typing import DBItem
from evofabric.core.clients import SentenceTransformerEmbed
async def main():
    """Index three sample sentences and print the matches for one query."""
    # Embedding client backed by a local sentence-transformers model.
    embedder = SentenceTransformerEmbed(model="all-MiniLM-L6-v2", device="cpu")

    # Vector store for the "documents" collection, configured with top_k=5.
    store = ChromaDB(
        collection_name="documents",
        persist_directory="./db",
        embedding=embedder,
        top_k=5,
    )

    # Wrap each raw sentence in a DBItem before handing it to the store.
    sentences = (
        "The quick brown fox jumps over the lazy dog",
        "A journey of a thousand miles begins with a single step",
        "To be or not to be, that is the question",
    )
    await store.add_texts([DBItem(text=sentence) for sentence in sentences])

    # Query the store and print the text of every returned item.
    for hit in await store.similarity_search("animal jumping"):
        print(f"Match: {hit.text}")


# Script entry point.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
基于元数据的搜索#
使用元数据进行过滤搜索:
from evofabric.core.vectorstore import ChromaDB
from evofabric.core.typing import DBItem
from evofabric.core.clients import SentenceTransformerEmbed
async def main():
    """Index three documents with metadata and run a metadata-filtered search.

    The vector store calls are awaited, so they have to run inside a
    coroutine: ``await`` at module level is a SyntaxError in a regular script.
    """
    # Initialize embedding client
    embed_client = SentenceTransformerEmbed(
        model="all-MiniLM-L6-v2",
        device="cpu",
    )

    # Initialize vector store
    vector_store = ChromaDB(
        collection_name="technical_docs",
        persist_directory="./tech_db",
        embedding=embed_client,
        top_k=5,
    )

    # Add documents with metadata
    technical_docs = [
        DBItem(
            text="Introduction to machine learning algorithms",
            metadata={"category": "ML", "difficulty": "beginner", "length": "short"},
        ),
        DBItem(
            text="Advanced deep learning architectures",
            metadata={"category": "DL", "difficulty": "advanced", "length": "long"},
        ),
        DBItem(
            text="Python programming basics",
            metadata={"category": "programming", "difficulty": "beginner", "length": "medium"},
        ),
    ]
    await vector_store.add_texts(technical_docs)

    # Search with metadata filter
    filter_results = await vector_store.similarity_search(
        query="learning algorithms",
        filter={"category": "ML", "difficulty": "beginner"},
    )
    for result in filter_results:
        # NOTE(review): assumes each result exposes a `score` attribute -- confirm.
        print(f"Filtered result: {result.text} (score: {result.score})")


# Run the function
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
批处理#
高效处理大型文档集合:
import asyncio
from evofabric.core.vectorstore import ChromaDB
from evofabric.core.typing import DBItem
async def process_large_dataset():
    """Insert 1000 generated documents in batches of 100, then run one search."""
    # NOTE: `your_embed_client` is a placeholder in this example; supply an
    # embedding client before running.
    store = ChromaDB(
        collection_name="large_dataset",
        persist_directory="./large_db",
        embedding=your_embed_client,
        top_k=10,
    )

    # Build the sample corpus; each item records its batch number and index.
    batch_size = 100
    all_documents = [
        DBItem(
            text=f"Document {n}: This is sample content for document number {n}",
            metadata={"batch": n // batch_size, "doc_id": n},
        )
        for n in range(1000)
    ]

    # Insert one slice of `batch_size` items per call, numbering batches from 1.
    batch_starts = range(0, len(all_documents), batch_size)
    for batch_no, start in enumerate(batch_starts, start=1):
        chunk = all_documents[start:start + batch_size]
        doc_ids = await store.add_texts(chunk)
        print(f"Processed batch {batch_no}: {len(doc_ids)} documents")

    # Query across everything that was inserted.
    results = await store.similarity_search("Find documents containing content")
    print(f"Found {len(results)} matching documents")


asyncio.run(process_large_dataset())
文档管理#
添加、更新和删除文档:
from evofabric.core.vectorstore import ChromaDB
from evofabric.core.typing import DBItem
async def main():
    """Add two documents with explicit IDs, re-add one, delete the other, then search."""
    # NOTE: `your_embed_client` is a placeholder in this example; supply an
    # embedding client before running.
    store = ChromaDB(
        collection_name="doc_management",
        persist_directory="./doc_db",
        embedding=your_embed_client,
    )

    # Seed the collection with two items that carry explicit IDs.
    first = DBItem(
        text="Initial document content",
        ids="doc_001",
        metadata={"status": "published"},
    )
    second = DBItem(
        text="Second document content",
        ids="doc_002",
        metadata={"status": "draft"},
    )
    await store.add_texts([first, second])

    # The example updates a document by adding a new item under the same ID.
    replacement = DBItem(
        text="Updated document content",
        ids="doc_001",
        metadata={"status": "published", "version": "2"},
    )
    await store.add_texts([replacement])

    # Remove the second document by its ID.
    await store.delete_by_ids(["doc_002"])

    # Search afterwards and report how many items come back.
    remaining_docs = await store.similarity_search("document")
    print(f"Remaining documents: {len(remaining_docs)}")


# Script entry point.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
RAG应用集成#
向量存储在检索增强生成(RAG)系统中的应用:
from evofabric.core.vectorstore import ChromaDB
from evofabric.core.typing import DBItem
from evofabric.core.clients import SentenceTransformerEmbed, OpenAIChatClient
class RAGSystem:
    """Small retrieval-augmented generation example.

    Holds an embedding client, a ChromaDB vector store and a chat client.
    Retrieved document texts are placed into the prompt sent to the chat client.
    """

    def __init__(self):
        # Local SentenceTransformer embedding for the knowledge base.
        self.embed_client = SentenceTransformerEmbed(
            model="all-MiniLM-L6-v2",
            device="cpu",
        )
        self.vector_store = ChromaDB(
            collection_name="knowledge_base",
            persist_directory="./rag_db",
            embedding=self.embed_client,
            top_k=3,
        )
        self.llm = OpenAIChatClient(api_key="your-key")

    async def add_knowledge(self, documents):
        """Wrap each string in ``documents`` in a DBItem and add it to the store."""
        await self.vector_store.add_texts([DBItem(text=doc) for doc in documents])

    async def ask_question(self, question):
        """Answer ``question`` using documents retrieved from the vector store.

        Returns a fixed fallback message when the search returns nothing;
        otherwise returns whatever ``self.llm.generate`` produces for the prompt.
        """
        relevant_docs = await self.vector_store.similarity_search(question)
        if not relevant_docs:
            return "I don't have enough context to answer this question."

        # Retrieved texts are separated by blank lines inside the prompt.
        context = "\n\n".join(doc.text for doc in relevant_docs)

        # Assembled piecewise so the prompt text does not depend on source indentation.
        prompt = (
            "\n"
            "Context:\n"
            f"{context}\n"
            f"Question: {question}\n"
            "Please answer the question based on the provided context.\n"
        )
        return await self.llm.generate(prompt)
# Usage example
async def main():
    """Load three facts into a RAGSystem, ask one question, and print the answer."""
    rag = RAGSystem()

    # Seed the knowledge base.
    facts = [
        "EvoFabric is a distributed framework for building AI applications",
        "It supports multiple backends and provides high-performance processing",
        "The framework is designed for scalability and reliability",
    ]
    await rag.add_knowledge(facts)

    # Ask a question and print the answer.
    print(await rag.ask_question("What is EvoFabric?"))


# Script entry point.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
错误处理和恢复#
错误处理模式:
import asyncio
from evofabric.core.vectorstore import ChromaDB
from evofabric.core.typing import DBItem
async def robust_vector_store_operations():
    """Add documents, search, and read collection info, reporting failures per step.

    A failure while adding or searching prints a message and ends the function.
    A failure while reading collection info is only printed. Anything that
    escapes the per-step handlers is printed by the outer handler.
    """
    # NOTE: `your_embed_client` is a placeholder in this example; supply an
    # embedding client before running.
    store = ChromaDB(
        collection_name="robust_example",
        persist_directory="./robust_db",
        embedding=your_embed_client,
    )

    try:
        # Step 1: insert documents. Retry or fallback logic would go in the handler.
        try:
            new_items = [DBItem(text="Document 1"), DBItem(text="Document 2")]
            added_ids = await store.add_texts(new_items)
            print(f"Successfully added {len(added_ids)} documents")
        except Exception as add_error:
            print(f"Error adding documents: {add_error}")
            return

        # Step 2: search. A fallback search or error response would go in the handler.
        try:
            hits = await store.similarity_search("test query")
            print(f"Found {len(hits)} results")
        except Exception as search_error:
            print(f"Error during search: {search_error}")
            return

        # Step 3: collection info. A failure here is reported only.
        try:
            collection_info = store.get_collection_info()
            print(f"Collection information: {collection_info}")
        except Exception as info_error:
            print(f"Error getting collection information: {info_error}")
    except Exception as unexpected:
        # Global error handling and recovery.
        print(f"Unexpected error: {unexpected}")


asyncio.run(robust_vector_store_operations())