Traditional keyword search fails when users phrase queries differently from your content. “machine learning tutorial” won’t match “AI crash course,” even though the two requests are semantically equivalent. Embeddings solve this by converting text into numerical vectors that capture meaning, enabling search based on semantic similarity rather than exact word matches.
This guide covers production-ready embedding
implementations, from choosing models to scaling vector databases. We’ll build semantic search systems that
understand user intent, not just keywords.
## Why Embeddings Beat Keyword Search

### The Keyword Search Problem
Traditional search fails with:
- Synonym mismatch: “cheap laptop” vs “affordable computer”
- Different phrasing: “how to deploy” vs “deployment guide”
- No context awareness: “bank” (financial institution vs. riverbank) can’t be disambiguated without surrounding context
- Typos and variants: “machne learning” vs “machine learning”
- Multilingual gaps: Can’t search across languages
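
A toy illustration of the first two failure modes: a naive token-overlap scorer (a stand-in for exact keyword matching) gives these query/document pairs a score of zero even though they mean the same thing.

```python
def keyword_overlap(query: str, doc: str) -> int:
    """Count shared lowercase tokens -- a crude stand-in for exact keyword matching."""
    return len(set(query.lower().split()) & set(doc.lower().split()))

print(keyword_overlap("cheap laptop", "affordable computer"))  # 0 -- no shared words
print(keyword_overlap("how to deploy", "deployment guide"))    # 0 -- different phrasing
```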
### How Embeddings Work
Embeddings convert text to vectors in high-dimensional space where:
- Similar meanings = close vectors
- Different meanings = distant vectors
- Relationships preserved mathematically
- Language-agnostic representations possible
## Pattern 1: OpenAI Embeddings

### Simple Implementation

```python
from openai import OpenAI
import numpy as np

client = OpenAI(api_key="your-api-key")

def get_embedding(text: str, model: str = "text-embedding-3-small") -> list[float]:
    """Generate an embedding for text using OpenAI"""
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

def cosine_similarity(vec1: list[float], vec2: list[float]) -> float:
    """Calculate cosine similarity between two vectors"""
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Usage
query = "machine learning tutorial"
doc1 = "Learn AI basics with Python"
doc2 = "Cooking recipes for beginners"

query_vec = get_embedding(query)
doc1_vec = get_embedding(doc1)
doc2_vec = get_embedding(doc2)

similarity1 = cosine_similarity(query_vec, doc1_vec)
similarity2 = cosine_similarity(query_vec, doc2_vec)

print(f"Query-Doc1 similarity: {similarity1:.3f}")  # Noticeably higher (related topics)
print(f"Query-Doc2 similarity: {similarity2:.3f}")  # Much lower (unrelated topic)
```
### Model Selection
| Model | Dimensions | Cost | Best For |
|---|---|---|---|
| text-embedding-3-small | 1536 | $0.02/1M tokens | General purpose, cost-effective |
| text-embedding-3-large | 3072 | $0.13/1M tokens | Higher precision needed |
| text-embedding-ada-002 | 1536 | $0.10/1M tokens | Legacy, being phased out |
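
Both v3 models also accept a `dimensions` parameter that shortens the returned vector, trading some accuracy for lower storage and faster search. A minimal sketch, reusing the `client` from Pattern 1:

```python
# Ask the API for a truncated 512-dimensional vector instead of the native 1536.
response = client.embeddings.create(
    input=["machine learning tutorial"],
    model="text-embedding-3-small",
    dimensions=512,  # smaller vector, cheaper to store and compare
)
print(len(response.data[0].embedding))  # 512
```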
## Pattern 2: Production Vector Database

### Pinecone Implementation

```python
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# Initialize clients
openai_client = OpenAI(api_key="your-openai-key")
pc = Pinecone(api_key="your-pinecone-key")

# Create index
index_name = "semantic-search"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI text-embedding-3-small size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(index_name)

# Add documents with embeddings
def add_documents(documents: list[dict]):
    """Batch upload documents to Pinecone"""
    vectors = []
    for doc in documents:
        # Generate embedding
        embedding = openai_client.embeddings.create(
            input=[doc['text']],
            model="text-embedding-3-small"
        ).data[0].embedding
        # Prepare vector
        vectors.append({
            'id': doc['id'],
            'values': embedding,
            'metadata': {
                'text': doc['text'],
                'category': doc.get('category', ''),
                'timestamp': doc.get('timestamp', '')
            }
        })
    # Batch upsert
    index.upsert(vectors=vectors, namespace="default")

# Search with metadata filtering
def semantic_search(query: str, top_k: int = 5, category: str | None = None):
    """Semantic search with optional metadata filtering"""
    # Generate query embedding
    query_embedding = openai_client.embeddings.create(
        input=[query],
        model="text-embedding-3-small"
    ).data[0].embedding
    # Search with optional category filter
    filter_dict = {"category": category} if category else None
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        filter=filter_dict,
        namespace="default"
    )
    return results

# Usage
documents = [
    {"id": "doc1", "text": "Python tutorial for beginners", "category": "tutorial"},
    {"id": "doc2", "text": "Advanced ML techniques", "category": "advanced"},
    {"id": "doc3", "text": "JavaScript basics", "category": "tutorial"}
]
add_documents(documents)

# Search
results = semantic_search("learning Python", top_k=3, category="tutorial")
for match in results.matches:
    print(f"Score: {match.score:.3f} - {match.metadata['text']}")
```
## Pattern 3: Local Vector Search with FAISS

### CPU-Optimized Search

```python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

class LocalSemanticSearch:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Initialize with an open-source embedding model"""
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.index = None
        self.documents = []

    def build_index(self, documents: list[str]):
        """Build a FAISS index from documents"""
        self.documents = documents
        # Generate embeddings
        embeddings = self.model.encode(documents, show_progress_bar=True)
        embeddings = np.array(embeddings).astype('float32')
        # Create FAISS index (exact, brute-force L2 search)
        self.index = faiss.IndexFlatL2(self.dimension)
        # Add vectors
        self.index.add(embeddings)
        print(f"Indexed {len(documents)} documents")

    def search(self, query: str, top_k: int = 5):
        """Search for similar documents"""
        # Generate query embedding
        query_embedding = self.model.encode([query]).astype('float32')
        # Search
        distances, indices = self.index.search(query_embedding, top_k)
        # Format results
        results = []
        for dist, idx in zip(distances[0], indices[0]):
            results.append({
                'document': self.documents[idx],
                'distance': float(dist),
                'similarity': 1 / (1 + dist)  # Rough conversion of L2 distance to a similarity score
            })
        return results

    def save_index(self, filepath: str):
        """Save the index to disk (the documents list must be persisted separately)"""
        faiss.write_index(self.index, filepath)

    def load_index(self, filepath: str):
        """Load an index from disk (reload the matching documents list separately)"""
        self.index = faiss.read_index(filepath)

# Usage
search = LocalSemanticSearch()
documents = [
    "How to train neural networks",
    "Python programming basics",
    "Deep learning with TensorFlow",
    "JavaScript for web development",
    "Machine learning algorithms overview"
]
search.build_index(documents)

# Search
results = search.search("learning AI", top_k=3)
for result in results:
    print(f"Similarity: {result['similarity']:.3f}")
    print(f"Document: {result['document']}\n")

# Save for later use
search.save_index("semantic_search.index")
```
## Pattern 4: Hybrid Search (Keyword + Semantic)

### Best of Both Worlds

```python
import numpy as np
from rank_bm25 import BM25Okapi

class HybridSearch:
    def __init__(self, embedding_model):
        self.embedding_model = embedding_model
        self.bm25 = None
        self.documents = []
        self.embeddings = None

    def index(self, documents: list[str]):
        """Index documents for both keyword and semantic search"""
        self.documents = documents
        # BM25 for keyword search
        tokenized_docs = [doc.lower().split() for doc in documents]
        self.bm25 = BM25Okapi(tokenized_docs)
        # Normalized embeddings for semantic search (so dot product equals cosine similarity)
        self.embeddings = self.embedding_model.encode(documents, normalize_embeddings=True)

    def search(self, query: str, top_k: int = 5, alpha: float = 0.5):
        """
        Hybrid search combining keyword and semantic scores.
        alpha: weight for semantic search (0 = keyword only, 1 = semantic only)
        """
        # Keyword search scores
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        # Scale BM25 scores to a comparable range
        bm25_scores = bm25_scores / (np.max(bm25_scores) + 1e-6)
        # Semantic search scores
        query_embedding = self.embedding_model.encode([query], normalize_embeddings=True)[0]
        semantic_scores = np.dot(self.embeddings, query_embedding)
        semantic_scores = semantic_scores / (np.max(semantic_scores) + 1e-6)
        # Combine scores
        hybrid_scores = (1 - alpha) * bm25_scores + alpha * semantic_scores
        # Get top k
        top_indices = np.argsort(hybrid_scores)[-top_k:][::-1]
        results = []
        for idx in top_indices:
            results.append({
                'document': self.documents[idx],
                'hybrid_score': float(hybrid_scores[idx]),
                'bm25_score': float(bm25_scores[idx]),
                'semantic_score': float(semantic_scores[idx])
            })
        return results

# Usage
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
search = HybridSearch(model)

documents = [
    "Python is a programming language used for web development",
    "Machine learning enables computers to learn from data",
    "Deep neural networks are the foundation of modern AI",
    "JavaScript runs in web browsers for interactive sites"
]
search.index(documents)

# Keyword-heavy search
results = search.search("Python programming", alpha=0.3)  # More keyword weight
print("Keyword-heavy results:")
for r in results:
    print(f"  {r['document'][:50]}... (hybrid: {r['hybrid_score']:.2f})")

# Semantic-heavy search
results = search.search("coding in Python", alpha=0.8)  # More semantic weight
print("\nSemantic-heavy results:")
for r in results:
    print(f"  {r['document'][:50]}... (hybrid: {r['hybrid_score']:.2f})")
```
## Production Best Practices

### 1. Batch Processing for Efficiency

```python
from openai import OpenAI

client = OpenAI(api_key="your-api-key")

def batch_embed_documents(documents: list[str], batch_size: int = 100):
    """Embed documents in batches instead of one request per document"""
    embeddings = []
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        # Generate embeddings for the whole batch in a single API call
        response = client.embeddings.create(
            input=batch,
            model="text-embedding-3-small"
        )
        embeddings.extend(item.embedding for item in response.data)
        print(f"Processed {min(i + batch_size, len(documents))}/{len(documents)} documents")
    return embeddings
```
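
Batch calls still fail occasionally (rate limits, transient network errors), so it helps to wrap the API call in retries. A minimal sketch using the `tenacity` library (a common choice, though nothing in the code above depends on it), reusing the `client` from the batch example:

```python
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=1, max=30))
def embed_batch_with_retry(batch: list[str]) -> list[list[float]]:
    """Retry a batch embedding call with exponential backoff on transient failures."""
    response = client.embeddings.create(input=batch, model="text-embedding-3-small")
    return [item.embedding for item in response.data]
```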
### 2. Caching for Performance

```python
import hashlib
import json

from openai import OpenAI

client = OpenAI(api_key="your-api-key")

class EmbeddingCache:
    def __init__(self, cache_file='embeddings_cache.json'):
        self.cache_file = cache_file
        self.cache = self.load_cache()

    def load_cache(self):
        try:
            with open(self.cache_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {}

    def save_cache(self):
        with open(self.cache_file, 'w') as f:
            json.dump(self.cache, f)

    def get_cache_key(self, text: str, model: str) -> str:
        """Generate a cache key from the text and model name"""
        return hashlib.md5(f"{text}|{model}".encode()).hexdigest()

    def get_embedding(self, text: str, model: str = "text-embedding-3-small"):
        """Get an embedding, using the cache when possible"""
        cache_key = self.get_cache_key(text, model)
        # Check cache
        if cache_key in self.cache:
            return self.cache[cache_key]
        # Generate embedding
        embedding = client.embeddings.create(
            input=[text],
            model=model
        ).data[0].embedding
        # Cache result and persist to disk
        self.cache[cache_key] = embedding
        self.save_cache()
        return embedding

# Usage
cache = EmbeddingCache()
embedding = cache.get_embedding("Sample text")  # First call: API request
embedding = cache.get_embedding("Sample text")  # Second call: served from cache
```
### 3. Monitoring and Metrics

```python
import time
from dataclasses import dataclass

@dataclass
class SearchMetrics:
    query: str
    num_results: int
    latency_ms: float
    top_score: float
    timestamp: float

class MonitoredSemanticSearch:
    def __init__(self):
        self.metrics: list[SearchMetrics] = []

    def _execute_search(self, query: str, top_k: int):
        """Plug in the actual search backend here (Pinecone, FAISS, hybrid, ...)."""
        raise NotImplementedError

    def search(self, query: str, top_k: int = 5):
        """Search while recording per-query metrics"""
        start_time = time.time()
        # Perform search (your implementation in _execute_search)
        results = self._execute_search(query, top_k)
        # Record metrics
        latency = (time.time() - start_time) * 1000
        self.metrics.append(SearchMetrics(
            query=query,
            num_results=len(results),
            latency_ms=latency,
            top_score=results[0]['score'] if results else 0,
            timestamp=time.time()
        ))
        return results

    def get_performance_report(self):
        """Generate a performance report"""
        if not self.metrics:
            return "No metrics available"
        latencies = [m.latency_ms for m in self.metrics]
        scores = [m.top_score for m in self.metrics]
        return {
            'total_searches': len(self.metrics),
            'avg_latency_ms': sum(latencies) / len(latencies),
            'p95_latency_ms': sorted(latencies)[int(len(latencies) * 0.95)],
            'avg_top_score': sum(scores) / len(scores),
            'min_top_score': min(scores)
        }
```
## Real-World Example: Document Q&A System

```python
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

class DocumentQA:
    def __init__(self):
        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        self.vectorstore = None
        self.qa_chain = None

    def load_documents(self, documents: list[str]):
        """Load and index documents"""
        # Split documents into overlapping chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        chunks = text_splitter.create_documents(documents)
        # Create vector store
        self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
        # Create QA chain (gpt-3.5-turbo is a chat model, so use ChatOpenAI)
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=ChatOpenAI(model="gpt-3.5-turbo"),
            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )

    def ask(self, question: str):
        """Ask a question about the indexed documents"""
        result = self.qa_chain({"query": question})
        return {
            'answer': result['result'],
            'sources': [doc.page_content for doc in result['source_documents']]
        }

# Usage
qa = DocumentQA()
documents = [
    "Python is a high-level programming language. It's known for readability.",
    "Machine learning is a subset of AI that learns from data patterns.",
    "Neural networks are inspired by biological neurons in the brain."
]
qa.load_documents(documents)

result = qa.ask("What is Python?")
print(f"Answer: {result['answer']}")
print(f"Sources: {result['sources']}")
```
## Common Pitfalls

- Not chunking long documents: Embeddings work best on coherent chunks (roughly 100-500 words)
- Ignoring metadata: Add filters for category, date, and author to improve relevance
- Poor query preprocessing: Clean and normalize queries before embedding them
- No embedding cache: Cache embeddings to reduce API costs
- Wrong similarity metric: Cosine similarity is usually the best choice for text
- Forgetting to normalize: Normalize vectors before comparing them with a plain dot product (see the sketch below)
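
A minimal sketch of that last point: once vectors are scaled to unit length, the dot product and cosine similarity are the same value.

```python
import numpy as np

def normalize(vec: np.ndarray) -> np.ndarray:
    """Scale a vector to unit length so a dot product equals cosine similarity."""
    return vec / (np.linalg.norm(vec) + 1e-12)

a = normalize(np.array([3.0, 4.0]))
b = normalize(np.array([4.0, 3.0]))
print(np.dot(a, b))  # cosine similarity, because both vectors have unit length
```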
## Key Takeaways

- Use OpenAI embeddings for a quick start; use sentence-transformers when you need cost-effective local inference
- Use Pinecone for production-scale cloud deployments, FAISS for local or self-hosted search
- Hybrid search (keyword + semantic) often outperforms either approach alone
- Chunk documents into 100-500 word segments
- Cache embeddings to reduce costs
- Monitor search quality and latency in production
- Add metadata filtering for better results
Semantic search with embeddings transforms how users find information. By understanding meaning rather than
matching keywords, you create search experiences that feel intelligent and intuitive—exactly what users expect
from modern applications.