# semantic_search.py
"""
Semantic Search Engine with FAISS + Sentence Transformers
=========================================================
Builds a fully local semantic search engine: embed documents,
index with FAISS, query with natural language, and visualize
the embedding space.
Requirements:
    pip install sentence-transformers faiss-cpu numpy rich matplotlib scikit-learn

Usage:
    python semantic_search.py
"""
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
import time
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# One shared Rich console carries every piece of terminal output in this script.
console = Console()

# ═══════════════════════════════════════════════════════════════
# 1. DOCUMENT CORPUS — 140 documents across 7 categories
# ═══════════════════════════════════════════════════════════════
# Announce the first stage inside a bordered panel before the corpus is defined.
console.print(
    Panel(
        "[bold cyan]Step 1: Preparing Document Corpus[/bold cyan]",
        border_style="blue",
    )
)
# Corpus grouped by topic: 7 categories with 20 short documents each.
# Insertion order matters — it fixes every document's position in the flat
# ``documents`` list that the rest of the script indexes into.
_CORPUS_BY_CATEGORY = {
    "Technology & Programming": [
        "Python is a high-level programming language known for its readability and simplicity",
        "JavaScript runs in web browsers and enables interactive web pages and dynamic content",
        "Docker containers package applications with their dependencies for consistent deployment",
        "Git is a distributed version control system that tracks changes in source code during development",
        "REST APIs use HTTP methods like GET, POST, PUT, and DELETE to interact with web resources",
        "SQL databases like PostgreSQL use structured tables with rows and columns to store data",
        "Linux is an open-source operating system kernel used in servers, desktops, and embedded devices",
        "Machine learning models learn patterns from training data to make predictions on new inputs",
        "React is a JavaScript library for building user interfaces using reusable components",
        "Kubernetes orchestrates containerized applications across clusters of machines automatically",
        "TypeScript adds static type checking to JavaScript, catching errors before runtime",
        "Redis is an in-memory data structure store used as a database, cache, and message broker",
        "GraphQL is a query language for APIs that lets clients request exactly the data they need",
        "CSS Grid and Flexbox are modern layout systems for building responsive web designs",
        "The command line interface allows developers to interact with the operating system via text",
        "Neural networks are computing systems inspired by biological neurons in the human brain",
        "Blockchain is a distributed ledger technology that underlies cryptocurrencies like Bitcoin",
        "DevOps combines software development and IT operations to shorten the development lifecycle",
        "Cloud computing provides on-demand access to computing resources over the internet",
        "Agile methodology emphasizes iterative development, collaboration, and responding to change",
    ],
    "Science & Nature": [
        "Photosynthesis is the process by which plants convert sunlight into chemical energy",
        "The Earth orbits the Sun at an average distance of about 93 million miles",
        "DNA molecules contain the genetic instructions for the development of all living organisms",
        "Black holes are regions of spacetime where gravity is so strong that nothing can escape",
        "The water cycle describes how water evaporates, forms clouds, and returns as precipitation",
        "Mitochondria are organelles that generate most of the chemical energy needed by cells",
        "Plate tectonics explains how Earth's crust moves, causing earthquakes and volcanic activity",
        "The speed of light in a vacuum is approximately 299,792 kilometers per second",
        "Evolution by natural selection explains how species adapt to their environments over time",
        "Quantum mechanics describes the behavior of matter and energy at the atomic scale",
        "The human brain contains approximately 86 billion neurons connected by trillions of synapses",
        "Climate change refers to long-term shifts in global temperatures and weather patterns",
        "Antibodies are proteins produced by the immune system to neutralize harmful pathogens",
        "The periodic table organizes chemical elements by their atomic number and properties",
        "Gravity is the force that attracts objects with mass toward each other",
        "Coral reefs are diverse underwater ecosystems built by colonies of tiny marine animals",
        "The Amazon rainforest produces about 20 percent of the world's oxygen supply",
        "Volcanoes form when magma from inside the Earth erupts through the crust",
        "Enzymes are biological catalysts that speed up chemical reactions in living organisms",
        "The universe is approximately 13.8 billion years old according to current estimates",
    ],
    "Cooking & Food": [
        "Pasta carbonara is an Italian dish made with eggs, cheese, pancetta, and black pepper",
        "Sourdough bread uses naturally occurring wild yeast and bacteria for fermentation",
        "Chocolate chip cookies should be baked until the edges are golden but the center is soft",
        "A roux is a mixture of flour and fat cooked together to thicken sauces and soups",
        "Sushi is a Japanese dish of vinegared rice combined with raw fish and vegetables",
        "Fermentation transforms food through the action of microorganisms like bacteria and yeast",
        "Cast iron skillets retain heat well and develop a natural non-stick seasoning over time",
        "Mise en place means preparing and organizing all ingredients before starting to cook",
        "Slow cooking breaks down tough connective tissue in meat, making it tender and flavorful",
        "Baking requires precise measurements because it involves complex chemical reactions",
        "Olive oil is a cornerstone of Mediterranean cuisine, used for cooking and dressing",
        "The Maillard reaction creates brown crusts and complex flavors when proteins are heated",
        "Curry is a broad term for spiced dishes originating from South Asian cuisines",
        "A proper steak should rest for several minutes after cooking to retain its juices",
        "Vanilla extract is made by soaking vanilla beans in alcohol to extract their flavor",
        "Kimchi is a traditional Korean fermented vegetable dish, typically made with napa cabbage",
        "French onion soup is topped with melted cheese over a crusty bread crouton",
        "Pressure cookers use trapped steam to cook food faster at higher temperatures",
        "Braising involves first searing meat at high heat, then cooking it slowly in liquid",
        "Gelato is an Italian frozen dessert that is denser and has less fat than ice cream",
    ],
    "Travel & Geography": [
        "The Great Wall of China stretches over 13,000 miles across northern China",
        "Tokyo is the most populous metropolitan area in the world with over 37 million residents",
        "The Sahara Desert is the largest hot desert on Earth, covering most of North Africa",
        "Venice is built on more than 100 small islands connected by canals and bridges",
        "The Northern Lights are caused by solar particles interacting with Earth's magnetic field",
        "Machu Picchu is a 15th-century Inca citadel located high in the Andes Mountains in Peru",
        "Iceland has over 130 volcanoes and numerous geothermal hot springs used for bathing",
        "The Great Barrier Reef is the world's largest coral reef system off the coast of Australia",
        "Paris is known as the City of Light and is home to the Eiffel Tower and the Louvre Museum",
        "The Amazon River is the largest river by water volume, flowing through South America",
        "Bali is an Indonesian island known for its terraced rice paddies and Hindu temples",
        "The Serengeti hosts one of the largest wildlife migrations with millions of wildebeest",
        "New Zealand's landscape features fjords, mountains, beaches, and geothermal areas",
        "The Pyramids of Giza were built as tombs for Egyptian pharaohs over 4,500 years ago",
        "Banff National Park in Canada features turquoise glacial lakes in the Rocky Mountains",
        "The Dead Sea is so salty that swimmers float effortlessly on its surface",
        "Mount Everest, at 29,032 feet, is the highest point on Earth above sea level",
        "The Grand Canyon was carved by the Colorado River over millions of years",
        "Petra is an ancient city in Jordan carved directly into red sandstone cliffs",
        "Antarctica is the coldest continent with temperatures dropping below minus 80 degrees Celsius",
    ],
    "Health & Fitness": [
        "Regular cardiovascular exercise strengthens the heart and improves blood circulation",
        "A balanced diet includes fruits, vegetables, whole grains, lean proteins, and healthy fats",
        "Getting seven to nine hours of quality sleep each night is essential for cognitive function",
        "Yoga combines physical postures, breathing techniques, and meditation for overall wellness",
        "Staying hydrated helps regulate body temperature and transport nutrients throughout the body",
        "Strength training builds muscle mass and increases bone density, reducing injury risk",
        "Meditation reduces stress by helping practitioners focus on the present moment",
        "High-intensity interval training alternates short bursts of intense exercise with rest periods",
        "Vitamin D is produced by the body when skin is exposed to sunlight",
        "Stretching improves flexibility and can help prevent injuries during physical activity",
        "Protein is essential for building and repairing muscle tissue after exercise",
        "Walking 10,000 steps per day is a common guideline for maintaining basic fitness",
        "Chronic stress can lead to high blood pressure, anxiety, and weakened immune function",
        "The Mediterranean diet emphasizes plant-based foods, olive oil, fish, and moderate wine",
        "Proper form during weightlifting is more important than lifting heavy weights",
        "Circuit training moves quickly between different exercises for a full-body workout",
        "Foam rolling helps release muscle tension and improve recovery after workouts",
        "Mindfulness involves paying deliberate attention to the present experience without judgment",
        "Core exercises like planks strengthen the abdominal and back muscles for better posture",
        "Overtraining syndrome occurs when exercise intensity exceeds the body's recovery capacity",
    ],
    "Business & Finance": [
        "Compound interest allows investments to grow exponentially over long periods of time",
        "Diversification spreads investment risk across different asset classes and sectors",
        "A startup is a young company designed to scale rapidly, often in the technology sector",
        "Supply and demand determine market prices in a competitive economy",
        "A balance sheet shows a company's assets, liabilities, and shareholder equity at a point in time",
        "Inflation reduces the purchasing power of money over time, affecting savings and wages",
        "Marketing involves promoting products through advertising, branding, and customer outreach",
        "Venture capital firms invest in early-stage companies with high growth potential",
        "A budget helps individuals and businesses track income and expenses to meet financial goals",
        "The stock market enables companies to raise capital by selling shares to public investors",
        "Cryptocurrencies use cryptographic techniques to enable secure decentralized transactions",
        "Customer relationship management systems help businesses track interactions with clients",
        "Supply chain management coordinates the flow of goods from suppliers to end customers",
        "An initial public offering marks the first time a company's stock is sold to the public",
        "Freelancing allows professionals to work independently for multiple clients on their own terms",
        "Gross domestic product measures the total value of goods and services produced by a country",
        "A recession is a significant decline in economic activity lasting more than a few months",
        "Leadership skills include clear communication, empathy, decision-making, and delegation",
        "E-commerce allows businesses to sell products and services directly to customers online",
        "A 401k is a retirement savings plan sponsored by employers in the United States",
    ],
    "Arts & Culture": [
        "The Renaissance was a period of great artistic and intellectual achievement in Europe",
        "Photography captures light on a sensor or film to create lasting visual images",
        "Jazz music originated in African American communities in New Orleans in the early 1900s",
        "Impressionist painters like Monet used loose brushstrokes to capture the effects of light",
        "Cinema uses moving images and sound to tell stories and evoke emotions in audiences",
        "The novel is a long-form work of fiction that explores characters and narrative in depth",
        "Ballet is a highly technical form of dance that originated in Italian Renaissance courts",
        "Architecture blends art and engineering to design buildings that are both beautiful and functional",
        "Hip hop culture emerged in the Bronx during the 1970s and includes rap, DJing, and breakdancing",
        "Pottery involves shaping clay into vessels and hardening them through high-temperature firing",
        "Theater performers bring characters to life on stage through dialogue, movement, and emotion",
        "Digital art uses computer technology as an essential part of the creative process",
        "Opera combines vocal performance, orchestral music, and theatrical staging",
        "Graffiti art transforms public walls and surfaces into canvases for personal expression",
        "The symphony orchestra consists of strings, woodwinds, brass, and percussion sections",
        "Mosaic art creates images by assembling small pieces of colored glass, stone, or tile",
        "Calligraphy is the art of beautiful handwriting using brushes or specialized pens",
        "Animation creates the illusion of movement by displaying a rapid sequence of images",
        "The blues is a musical genre that expresses deep emotion through distinctive chord progressions",
        "Abstract art uses shapes, colors, and forms to achieve its effect rather than realistic depiction",
    ],
}

# Flat, ordered list of every document: categories are concatenated in the
# order declared above, so positions 0-19 are technology, 20-39 science, etc.
documents = [
    text
    for category_texts in _CORPUS_BY_CATEGORY.values()
    for text in category_texts
]
# Report the corpus size before the slower embedding stage begins.
console.print(f"[green]Loaded {len(documents)} documents across 7 categories[/green]")

# ═══════════════════════════════════════════════════════════════
# 2. GENERATE EMBEDDINGS
# ═══════════════════════════════════════════════════════════════
console.print("\n[bold cyan]Step 2: Generating Embeddings[/bold cyan]")

# Load the sentence-embedding model by name.
model_name = "all-MiniLM-L6-v2"
console.print(f"Loading model: [yellow]{model_name}[/yellow]...")
model = SentenceTransformer(model_name)

console.print("Encoding documents...")
with Progress(
    SpinnerColumn(),
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    console=console,
) as progress:
    task = progress.add_task(
        "[cyan]Generating embeddings...",
        total=len(documents),
    )
    # The whole corpus is encoded in a single call with the library's own
    # progress bar disabled, so the Rich bar is only marked complete afterwards.
    # normalize_embeddings=True is what lets the inner-product index built
    # later act as a cosine-similarity index.
    embeddings = model.encode(
        documents,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    progress.update(task, completed=len(documents))

console.print(f"[green]{len(embeddings)} embeddings generated, shape: {embeddings.shape}[/green]")
# ═══════════════════════════════════════════════════════════════
# 3. BUILD FAISS INDEX
# ═══════════════════════════════════════════════════════════════
console.print("\n[bold cyan]Step 3: Building FAISS Index[/bold cyan]")

# The vector width is read from the embedding matrix rather than hard-coded.
dimension = embeddings.shape[1]

# Flat inner-product index. The embeddings were L2-normalized when encoded,
# so the inner product it returns equals cosine similarity.
index = faiss.IndexFlatIP(dimension)

# Vectors are cast to float32 before being added; row i of the index
# corresponds to documents[i].
index.add(
    embeddings.astype(np.float32)
)

console.print(f"[green]FAISS index built: {index.ntotal} vectors, {index.d} dimensions[/green]")
# ═══════════════════════════════════════════════════════════════
# 4. SEMANTIC SEARCH FUNCTION
# ═══════════════════════════════════════════════════════════════
def semantic_search(query: str, top_k: int = 5) -> list:
    """Search for documents semantically similar to the query.

    The query is embedded with the module-level ``model`` (L2-normalized,
    float32) and matched against the module-level FAISS ``index``.

    Args:
        query: Natural-language query text.
        top_k: Maximum number of hits to return. Values larger than the
            number of indexed vectors are clamped; values <= 0 yield ``[]``.

    Returns:
        A list of dicts ordered as FAISS returns them (best match first),
        each with the keys ``score`` (float inner product),
        ``similarity_pct`` (score as a percentage string), ``document``
        (the matched text) and ``index`` (its position in ``documents``).
    """
    # Never ask FAISS for more neighbours than it holds: it pads the missing
    # slots with index -1, and documents[-1] would silently return the last
    # document with a meaningless score.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype(np.float32)

    # search() returns one row per query; only a single query is sent.
    scores, indices = index.search(query_embedding, k)

    return [
        {
            "score": float(score),
            "similarity_pct": f"{score * 100:.1f}%",
            "document": documents[idx],
            "index": int(idx),
        }
        for score, idx in zip(scores[0], indices[0])
        if idx >= 0  # defensive: drop any padding entries FAISS still emits
    ]
def display_search_results(query: str, results: list) -> None:
    """Pretty-print search results in a Rich table.

    Args:
        query: The query text, echoed above the table.
        results: Hit dicts carrying the keys ``score``, ``similarity_pct``
            and ``document`` (the shape produced by ``semantic_search``).

    The query and the document text are free-form, so both are escaped
    before being handed to Rich. Otherwise bracketed text such as ``[int]``
    or ``[/b]`` would be parsed as console markup and either vanish from the
    output or raise a markup error.
    """
    # Imported here so the function carries its own dependency.
    from rich.markup import escape

    console.print(f"\n[bold]Query:[/bold] [cyan]{escape(query)}[/cyan]\n")

    table = Table(show_header=True, header_style="bold white")
    table.add_column("#", style="dim", width=3)
    table.add_column("Score", style="green", width=8)
    table.add_column("Document", style="white", max_width=90)

    for rank, hit in enumerate(results, 1):
        # Colour-code the score: strong (> 0.5), moderate (> 0.3), weak.
        score_val = hit["score"]
        if score_val > 0.5:
            score_style = "green"
        elif score_val > 0.3:
            score_style = "yellow"
        else:
            score_style = "dim"

        table.add_row(
            str(rank),
            f"[{score_style}]{hit['similarity_pct']}[/{score_style}]",
            # Truncate first, then escape, so an escape sequence is never cut in half.
            escape(hit["document"][:90]),
        )

    console.print(table)
# ═══════════════════════════════════════════════════════════════
# 5. DEMO: SEMANTIC vs KEYWORD SEARCH
# ═══════════════════════════════════════════════════════════════
# NOTE: this section is announced on screen as "Step 4" (the demo) and
# "Step 5" (the comparison); the printed numbering differs from this header.
console.print("\n[bold cyan]Step 4: Semantic Search Demo[/bold cyan]")

# Natural-language questions run against the index to show the engine at work.
queries = [
    "How do I make pasta at home?",
    "What causes earthquakes and volcanic eruptions?",
    "Tell me about investing and saving money",
    "Best places to visit in Asia",
    "How to stay healthy and fit",
    "I want to learn web development",
    "What is the theory of evolution?",
]

# Show the three best matches for each demo question.
for query in queries:
    results = semantic_search(query, top_k=3)
    display_search_results(query, results)

# Keyword vs Semantic comparison
console.print("\n[bold cyan]Step 5: Semantic vs Keyword Comparison[/bold cyan]")
def keyword_search(query: str, top_k: int = 5, *, corpus=None):
    """Naive keyword search — rank documents by how many query words they share.

    Text is lower-cased and split into word tokens with punctuation removed,
    so ``"applications?"`` in a query matches ``"applications,"`` in a
    document. No stemming or stop-word removal is applied.

    Args:
        query: Query text.
        top_k: Maximum number of hits to return; values <= 0 yield ``[]``.
        corpus: Optional sequence of document strings to search. Defaults to
            the module-level ``documents`` list.

    Returns:
        A list of dicts with the keys ``index`` (position in the corpus),
        ``document`` (the text) and ``keyword_matches`` (number of distinct
        shared words), best match first. Ties keep corpus order, and
        documents with no shared word are omitted.
    """
    import re  # local import keeps the function self-contained

    if top_k <= 0:
        # A negative slice bound would otherwise return "all but the last k".
        return []

    docs = documents if corpus is None else corpus

    def tokenize(text):
        # Word characters, optionally joined by apostrophes ("earth's").
        return set(re.findall(r"\w+(?:'\w+)*", text.lower()))

    query_words = tokenize(query)
    scored = [
        (len(query_words & tokenize(doc)), i, doc)
        for i, doc in enumerate(docs)
    ]
    # Highest overlap first; equal scores fall back to corpus order instead of
    # comparing positions in reverse (and never compare the document text).
    scored.sort(key=lambda item: (-item[0], item[1]))

    return [
        {"index": idx, "document": doc, "keyword_matches": score}
        for score, idx, doc in scored[:top_k]
        if score > 0
    ]
# One query run through both engines so their rankings can be compared directly.
sample_query = "How do I create and deploy web applications?"
console.print(f"\n[bold]Query:[/bold] [cyan]{sample_query}[/cyan]")

# Word-overlap baseline (uses keyword_search's default top_k).
console.print("\n[yellow]Keyword Search (word overlap):[/yellow]")
for i, r in enumerate(keyword_search(sample_query), start=1):
    console.print(f" {i}. (matches: {r['keyword_matches']}) {r['document'][:80]}")

# Embedding-based ranking for the same query.
console.print("\n[green]Semantic Search (meaning similarity):[/green]")
for i, r in enumerate(semantic_search(sample_query, top_k=5), start=1):
    console.print(f" {i}. ({r['similarity_pct']}) {r['document'][:80]}")
# ═══════════════════════════════════════════════════════════════
# 6. VISUALIZE EMBEDDINGS WITH PCA
# ═══════════════════════════════════════════════════════════════
console.print("\n[bold cyan]Step 6: Visualizing Embeddings with PCA[/bold cyan]")

# Project the embedding matrix down to two principal components for plotting.
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings)

categories = ["Tech", "Science", "Cooking", "Travel", "Health", "Business", "Arts"]
colors = ["#3b82f6", "#10b981", "#f59e0b", "#8b5cf6", "#ef4444", "#06b6d4", "#ec4899"]

# The corpus is laid out as consecutive runs of 20 documents per category,
# so a document's category is its position integer-divided by the run length.
docs_per_category = 20
category_ids = np.arange(len(documents)) // docs_per_category

fig, ax = plt.subplots(figsize=(14, 10))
for i, cat in enumerate(categories):
    mask = category_ids == i  # boolean row selector for this category
    ax.scatter(
        embeddings_2d[mask, 0],
        embeddings_2d[mask, 1],
        c=colors[i],
        label=cat,
        alpha=0.7,
        s=50,
        edgecolors='white',
        linewidth=0.5,
    )

# The source dimensionality is read from the data so the title stays correct
# if a different embedding model is used.
ax.set_title(
    f"Document Embeddings Visualized with PCA\n{embeddings.shape[1]}-dimensional vectors → 2D projection",
    fontsize=14,
    fontweight='bold',
    pad=15,
)
ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)", fontsize=11)
ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)", fontsize=11)
ax.legend(loc='upper right', framealpha=0.9, fontsize=10)
ax.grid(True, alpha=0.2)
plt.tight_layout()
plt.savefig('embedding_visualization.png', dpi=150, bbox_inches='tight')
# Release the figure once it is on disk; pyplot otherwise keeps it alive.
plt.close(fig)
console.print("[green]Visualization saved to embedding_visualization.png[/green]")
# ═══════════════════════════════════════════════════════════════
# 7. SEARCH SPEED BENCHMARK
# ═══════════════════════════════════════════════════════════════
console.print("\n[bold cyan]Step 7: Search Speed Benchmark[/bold cyan]")

bench_queries = [
    "machine learning and artificial intelligence",
    "best pasta recipes from Italy",
    "places to visit in Europe",
    "how to invest in the stock market",
    "benefits of regular exercise",
]

# The search count is derived from the loop bounds so the reported figures
# cannot drift from the work actually performed.
bench_rounds = 200
n_searches = bench_rounds * len(bench_queries)

console.print(f"Running {n_searches:,} searches...")

# perf_counter() is the monotonic, high-resolution clock intended for timing
# intervals; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
for _ in range(bench_rounds):
    for q in bench_queries:
        # Each call embeds the query and then searches the index, so this
        # measures end-to-end query latency, not the FAISS lookup alone.
        semantic_search(q, top_k=5)
total_time = time.perf_counter() - t0

avg_time = total_time / n_searches * 1000  # seconds per query -> ms
console.print(f"[green]{n_searches:,} searches in {total_time:.2f}s[/green]")
console.print(f" Average: [yellow]{avg_time:.2f} ms[/yellow] per query")
console.print(f" Throughput: [yellow]{n_searches / total_time:.0f}[/yellow] queries/sec")

console.print(f"\n[bold green]{'='*60}[/bold green]")
console.print("[bold green]Semantic Search Engine — Build Complete![/bold green]")