Project: Autonomous Research Agent
Build a production-ready autonomous research agent that combines web search, content extraction, analysis, and report generation to conduct comprehensive research on any topic.
Project Goal: Given any topic, the agent plans its own search queries, gathers information from multiple sources, analyzes the findings, and produces a well-structured research report.
Project Architecture
Our research agent will have these components:
┌─────────────────────────────────────────────────────────┐
│                  Research Orchestrator                  │
└─────────────────────────────────────────────────────────┘
                             │
           ┌─────────────────┼─────────────────┐
           ▼                 ▼                 ▼
    ┌──────────────┐  ┌──────────────┐  ┌──────────────┐
    │    Search    │  │   Content    │  │   Analysis   │
    │    Agent     │  │  Extractor   │  │    Agent     │
    └──────────────┘  └──────────────┘  └──────────────┘
           │                 │                 │
           └─────────────────┼─────────────────┘
                             ▼
                      ┌──────────────┐
                      │    Report    │
                      │  Generator   │
                      └──────────────┘
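The data flow, written as the calls the orchestrator will eventually make (a sketch; flatten is shorthand for merging the per-query result lists, and the names match the classes built below):

# topic -> queries -> search results -> extracted sources -> analyses -> synthesis -> report
queries   = search_agent.generate_search_queries(topic)
results   = search_agent.multi_query_search(queries)           # Dict[query, List[SearchResult]]
sources   = content_extractor.extract_batch(flatten(results))  # flatten: merge per-query lists
analyses  = [analysis_agent.analyze_source(s.content, topic) for s in sources]
synthesis = analysis_agent.synthesize_findings(analyses, topic)
report    = report_generator.generate_report(topic, synthesis, sources)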
Step 1: Search Agent
First, implement web search capabilities:
import os
import requests
from typing import List, Dict, Optional
from dataclasses import dataclass


@dataclass
class SearchResult:
    """Represents a single search result."""
    title: str
    url: str
    snippet: str
    content: Optional[str] = None


class SearchAgent:
    """Agent for web searching."""

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize search agent.

        Args:
            api_key: API key for the search service (e.g., Google Custom Search, Bing)
        """
        self.api_key = api_key or os.getenv("SEARCH_API_KEY")
        self.search_engine_id = os.getenv("SEARCH_ENGINE_ID")

    def search(self, query: str, num_results: int = 10) -> List[SearchResult]:
        """
        Search the web for a query.

        Args:
            query: Search query
            num_results: Number of results to return

        Returns:
            List of search results
        """
        print(f"\n🔍 Searching for: {query}")

        # Using the Google Custom Search API
        url = "https://www.googleapis.com/customsearch/v1"
        params = {
            "key": self.api_key,
            "cx": self.search_engine_id,
            "q": query,
            "num": min(num_results, 10)  # The API returns at most 10 results per request
        }

        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            results = []
            for item in data.get("items", []):
                results.append(SearchResult(
                    title=item.get("title", ""),
                    url=item.get("link", ""),
                    snippet=item.get("snippet", "")
                ))

            print(f"✅ Found {len(results)} results")
            return results

        except requests.RequestException as e:
            print(f"❌ Search error: {e}")
            return []

    def multi_query_search(
        self,
        queries: List[str],
        results_per_query: int = 5
    ) -> Dict[str, List[SearchResult]]:
        """
        Execute multiple search queries.

        Args:
            queries: List of search queries
            results_per_query: Results per query

        Returns:
            Dictionary mapping queries to results
        """
        all_results = {}
        for query in queries:
            all_results[query] = self.search(query, results_per_query)
        return all_results

    def generate_search_queries(
        self,
        topic: str,
        num_queries: int = 3,
        model: str = "gpt-4"
    ) -> List[str]:
        """
        Generate diverse search queries for a topic using an LLM.

        Args:
            topic: Research topic
            num_queries: Number of queries to generate
            model: LLM model to use

        Returns:
            List of search queries
        """
        from openai import OpenAI
        client = OpenAI()  # Reads OPENAI_API_KEY from the environment

        prompt = f"""Generate {num_queries} diverse search queries to research this topic comprehensively:

Topic: {topic}

Requirements:
1. Cover different aspects of the topic
2. Include recent developments (use year if relevant)
3. Target authoritative sources
4. Be specific and focused

Return only the queries, one per line."""

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0.7
        )

        queries_text = response.choices[0].message.content
        queries = [q.strip() for q in queries_text.split("\n") if q.strip()]
        return queries[:num_queries]
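Before wiring the agent into the pipeline, a quick standalone smoke test is worth running (this assumes SEARCH_API_KEY and SEARCH_ENGINE_ID are set; the query is just an example):

agent = SearchAgent()
for r in agent.search("AI agent architectures survey", num_results=5):
    print(f"- {r.title}\n  {r.url}")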
Step 2: Content Extractor
Extract and clean content from web pages:
import re
import requests
from typing import List, Optional

from bs4 import BeautifulSoup
import trafilatura


class ContentExtractor:
    """Extract and clean content from web pages."""

    def __init__(self, timeout: int = 10):
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Research Bot)'
        })

    def extract(self, url: str) -> Optional[str]:
        """
        Extract main content from a URL.

        Args:
            url: URL to extract content from

        Returns:
            Extracted text content or None
        """
        print(f"\n📄 Extracting content from: {url}")

        try:
            # Fetch page
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            # Use trafilatura for content extraction (works well for articles)
            content = trafilatura.extract(response.text)
            if content:
                content = self._clean_content(content)
                print(f"✅ Extracted {len(content)} characters")
                return content

            # Fall back to BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'footer', 'header']):
                element.decompose()

            # Extract text
            text = self._clean_content(soup.get_text())
            print(f"✅ Extracted {len(text)} characters (fallback)")
            return text

        except Exception as e:
            print(f"❌ Extraction error: {e}")
            return None

    def _clean_content(self, text: str) -> str:
        """Clean extracted content."""
        # Collapse runs of spaces/tabs within each line, but keep the line
        # breaks so that short navigation/UI lines can be filtered out below
        lines = [re.sub(r'[ \t]+', ' ', line).strip() for line in text.split('\n')]
        lines = [line for line in lines if len(line) > 20]
        return '\n\n'.join(lines)

    def extract_batch(
        self,
        results: List[SearchResult],
        max_content_length: int = 5000
    ) -> List[SearchResult]:
        """
        Extract content for multiple search results.

        Args:
            results: List of search results
            max_content_length: Maximum content length per page

        Returns:
            Search results with content added
        """
        for result in results:
            content = self.extract(result.url)
            if content:
                # Truncate overly long pages
                if len(content) > max_content_length:
                    content = content[:max_content_length] + "..."
                result.content = content
        return results
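The extractor works standalone too; a minimal check (the URL is a placeholder):

extractor = ContentExtractor(timeout=10)
text = extractor.extract("https://example.com/article")
if text:
    print(text[:500])  # Preview the first 500 characters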
Step 3: Analysis Agent
Analyze and synthesize information:
import json
from typing import Any, Dict, List

from openai import OpenAI


class AnalysisAgent:
    """Agent for analyzing and synthesizing research."""

    def __init__(self, model: str = "gpt-4"):
        self.model = model
        self.client = OpenAI()  # Reads OPENAI_API_KEY from the environment

    def analyze_source(
        self,
        content: str,
        research_question: str
    ) -> Dict[str, Any]:
        """
        Analyze a single source.

        Args:
            content: Source content
            research_question: Question being researched

        Returns:
            Analysis results
        """
        prompt = f"""Analyze this source in relation to the research question.

Research Question: {research_question}

Source Content:
{content[:3000]}

Provide:
1. Key findings relevant to the research question
2. Important facts or data
3. Credibility assessment
4. Relevance score (1-10)

Format as JSON."""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a research analyst."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3
        )

        # Parse the response (simplified; JSON mode is more robust, see below)
        try:
            analysis = json.loads(response.choices[0].message.content)
        except json.JSONDecodeError:
            # Fall back to treating the whole reply as unstructured findings
            analysis = {
                "key_findings": response.choices[0].message.content,
                "relevance_score": 5
            }

        return analysis

    def synthesize_findings(
        self,
        analyses: List[Dict[str, Any]],
        research_question: str
    ) -> str:
        """
        Synthesize multiple source analyses.

        Args:
            analyses: List of source analyses
            research_question: Research question

        Returns:
            Synthesized findings
        """
        # Sort by relevance so the strongest sources lead the prompt
        sorted_analyses = sorted(
            analyses,
            key=lambda x: x.get("relevance_score", 0),
            reverse=True
        )

        # Build the synthesis prompt from the top ten sources
        findings_text = "\n\n".join(
            f"Source {i+1}:\n{analysis.get('key_findings', 'No findings')}"
            for i, analysis in enumerate(sorted_analyses[:10])
        )

        prompt = f"""Synthesize these research findings into a comprehensive answer.

Research Question: {research_question}

Findings from Sources:
{findings_text}

Create a well-organized synthesis that:
1. Answers the research question comprehensively
2. Integrates findings from multiple sources
3. Identifies patterns and themes
4. Notes any contradictions or uncertainties
5. Provides evidence-based conclusions"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a research synthesizer."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.5
        )

        return response.choices[0].message.content

    def identify_gaps(
        self,
        synthesis: str,
        research_question: str
    ) -> List[str]:
        """
        Identify gaps in the research that need more investigation.

        Args:
            synthesis: Current synthesis
            research_question: Research question

        Returns:
            List of follow-up questions
        """
        prompt = f"""Identify gaps or unanswered questions in this research.

Research Question: {research_question}

Current Synthesis:
{synthesis}

What important aspects need more investigation?
Provide 3-5 specific follow-up questions."""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0.7
        )

        questions_text = response.choices[0].message.content
        # Strip list numbering/bullets from each line
        questions = [
            q.strip().lstrip('0123456789.-) ')
            for q in questions_text.split('\n')
            if q.strip()
        ]
        return questions[:5]
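The parsing in analyze_source falls back to raw text when the model returns malformed JSON. If your deployment has a model that supports JSON mode (the "gpt-4o" name below is an assumption; substitute whatever model you use), a variant method you could add to AnalysisAgent looks like this:

    def analyze_source_json(self, content: str, research_question: str) -> dict:
        """Sketch: JSON-mode variant of analyze_source (hypothetical helper).

        Assumes a model that supports response_format (e.g. "gpt-4o"); JSON
        mode also requires the word "JSON" to appear in the messages.
        """
        prompt = (
            f"Analyze this source for the research question. Reply as JSON with keys "
            f"key_findings, important_facts, credibility, relevance_score.\n\n"
            f"Research Question: {research_question}\n\nSource:\n{content[:3000]}"
        )
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
            temperature=0.3
        )
        return json.loads(response.choices[0].message.content)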
Step 4: Report Generator
Generate comprehensive research reports:
from datetime import datetime
from typing import Dict, List, Optional

from openai import OpenAI


class ReportGenerator:
    """Generate research reports."""

    def __init__(self, model: str = "gpt-4"):
        self.model = model
        self.client = OpenAI()  # Reads OPENAI_API_KEY from the environment

    def generate_report(
        self,
        research_question: str,
        synthesis: str,
        sources: List[SearchResult],
        metadata: Optional[Dict] = None
    ) -> str:
        """
        Generate a comprehensive research report.

        Args:
            research_question: The research question
            synthesis: Synthesized findings
            sources: Source materials
            metadata: Additional metadata

        Returns:
            Formatted report
        """
        # Prepare sources section
        sources_text = self._format_sources(sources)

        # Generate report
        prompt = f"""Create a comprehensive research report.

Research Question: {research_question}

Synthesized Findings:
{synthesis}

Sources Used:
{sources_text}

Create a professional report with:
1. Executive Summary
2. Introduction
3. Key Findings (organized by themes)
4. Analysis and Discussion
5. Conclusions
6. Sources/References

Use markdown formatting."""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a research report writer."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.5,
            max_tokens=2000
        )

        report = response.choices[0].message.content

        # Add metadata header
        report = self._add_metadata(report, research_question, metadata)
        return report

    def _format_sources(self, sources: List[SearchResult]) -> str:
        """Format sources for inclusion in the report."""
        formatted = []
        for i, source in enumerate(sources[:20], 1):
            formatted.append(
                f"{i}. {source.title}\n   URL: {source.url}"
            )
        return "\n".join(formatted)

    def _add_metadata(
        self,
        report: str,
        research_question: str,
        metadata: Optional[Dict]
    ) -> str:
        """Add a metadata header to the report."""
        header = (
            f"# Research Report\n\n"
            f"**Research Question:** {research_question}\n"
            f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
            f"**Agent Version:** 1.0\n"
        )
        # Include any extra metadata supplied by the caller
        if metadata:
            for key, value in metadata.items():
                header += f"**{key.replace('_', ' ').title()}:** {value}\n"
        header += "\n---\n\n"
        return header + report

    def export_report(
        self,
        report: str,
        filename: str,
        format: str = "markdown"
    ):
        """
        Export report to file.

        Args:
            report: Report content
            filename: Output filename
            format: Output format (markdown or html)
        """
        if format == "markdown":
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(report)
        elif format == "html":
            # Convert markdown to HTML
            import markdown
            html = markdown.markdown(report)
            html_template = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Research Report</title>
    <style>
        body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 50px auto; line-height: 1.6; }}
        h1, h2, h3 {{ color: #333; }}
        code {{ background: #f4f4f4; padding: 2px 5px; }}
    </style>
</head>
<body>
{html}
</body>
</html>"""
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(html_template)
        else:
            raise ValueError(f"Unsupported format: {format}")

        print(f"✅ Report exported to: {filename}")
Step 5: Research Orchestrator
The main orchestrator that coordinates everything:
class ResearchOrchestrator:
    """Orchestrates the complete research process."""

    def __init__(
        self,
        search_agent: SearchAgent,
        content_extractor: ContentExtractor,
        analysis_agent: AnalysisAgent,
        report_generator: ReportGenerator
    ):
        self.search_agent = search_agent
        self.content_extractor = content_extractor
        self.analysis_agent = analysis_agent
        self.report_generator = report_generator

    def research(
        self,
        topic: str,
        depth: str = "standard",
        max_iterations: int = 2
    ) -> str:
        """
        Conduct autonomous research on a topic.

        Args:
            topic: Research topic/question
            depth: Research depth (quick, standard, deep)
            max_iterations: Number of research/gap-filling iterations

        Returns:
            Final research report
        """
        print(f"\n{'='*60}")
        print("AUTONOMOUS RESEARCH AGENT")
        print(f"Topic: {topic}")
        print(f"Depth: {depth}")
        print(f"{'='*60}\n")

        # Configure based on depth
        config = self._get_depth_config(depth)

        all_sources = []
        iteration_syntheses = []

        for iteration in range(max_iterations):
            print(f"\n{'='*60}")
            print(f"ITERATION {iteration + 1}")
            print(f"{'='*60}\n")

            # 1. Generate search queries
            if iteration == 0:
                queries = self.search_agent.generate_search_queries(
                    topic,
                    num_queries=config['queries_per_iteration']
                )
            else:
                # Use gap analysis on the previous synthesis to generate new queries
                gaps = self.analysis_agent.identify_gaps(
                    iteration_syntheses[-1],
                    topic
                )
                queries = gaps[:config['queries_per_iteration']]

            print("\n📋 Search Queries:")
            for i, query in enumerate(queries, 1):
                print(f"  {i}. {query}")

            # 2. Search
            search_results = self.search_agent.multi_query_search(
                queries,
                results_per_query=config['results_per_query']
            )

            # Flatten per-query results into one list
            iteration_sources = []
            for query_results in search_results.values():
                iteration_sources.extend(query_results)

            # 3. Extract content
            print("\n📥 Extracting Content...")
            iteration_sources = self.content_extractor.extract_batch(
                iteration_sources,
                max_content_length=config['max_content_length']
            )

            # Keep only sources that yielded content
            iteration_sources = [s for s in iteration_sources if s.content]
            all_sources.extend(iteration_sources)
            print(f"✅ Extracted content from {len(iteration_sources)} sources")

            # 4. Analyze sources
            print("\n🔬 Analyzing Sources...")
            analyses = []
            for source in iteration_sources:
                analyses.append(self.analysis_agent.analyze_source(
                    source.content,
                    topic
                ))

            # 5. Synthesize findings
            print("\n🧬 Synthesizing Findings...")
            synthesis = self.analysis_agent.synthesize_findings(
                analyses,
                topic
            )
            iteration_syntheses.append(synthesis)

            print(f"\n📊 Iteration {iteration + 1} Complete")
            print(f"  Sources processed: {len(iteration_sources)}")
            print(f"  Total sources: {len(all_sources)}")

        # 6. Generate final report
        print(f"\n{'='*60}")
        print("GENERATING FINAL REPORT")
        print(f"{'='*60}\n")

        # Combine the syntheses from all iterations so the report reflects
        # both the initial pass and the gap-filling follow-ups
        final_synthesis = "\n\n".join(iteration_syntheses)

        report = self.report_generator.generate_report(
            research_question=topic,
            synthesis=final_synthesis,
            sources=all_sources,
            metadata={
                'iterations': max_iterations,
                'total_sources': len(all_sources),
                'depth': depth
            }
        )

        print("\n✅ Research Complete!")
        return report

    def _get_depth_config(self, depth: str) -> Dict:
        """Get configuration based on research depth."""
        configs = {
            "quick": {
                "queries_per_iteration": 2,
                "results_per_query": 3,
                "max_content_length": 2000
            },
            "standard": {
                "queries_per_iteration": 3,
                "results_per_query": 5,
                "max_content_length": 4000
            },
            "deep": {
                "queries_per_iteration": 5,
                "results_per_query": 8,
                "max_content_length": 6000
            }
        }
        return configs.get(depth, configs["standard"])
Complete Usage Example
Put it all together:
# Initialize components (assumes SEARCH_API_KEY, SEARCH_ENGINE_ID, and
# OPENAI_API_KEY are set in the environment)
search_agent = SearchAgent()
content_extractor = ContentExtractor()
analysis_agent = AnalysisAgent(model="gpt-4")
report_generator = ReportGenerator(model="gpt-4")

# Create orchestrator
orchestrator = ResearchOrchestrator(
    search_agent=search_agent,
    content_extractor=content_extractor,
    analysis_agent=analysis_agent,
    report_generator=report_generator
)

# Conduct research
report = orchestrator.research(
    topic="What are the latest developments in AI agent architectures?",
    depth="standard",
    max_iterations=2
)

# Export the report in both formats
report_generator.export_report(
    report=report,
    filename="research_report.md",
    format="markdown"
)
report_generator.export_report(
    report=report,
    filename="research_report.html",
    format="html"
)
print("\n" + "="*60)
print("FINAL REPORT")
print("="*60 + "\n")
print(report)
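Before running this end to end, it helps to fail fast on missing credentials. A small guard (the variable names match the code above; the pip line lists the usual PyPI names for the packages used):

# pip install requests beautifulsoup4 trafilatura openai markdown tqdm
import os

required = ["SEARCH_API_KEY", "SEARCH_ENGINE_ID", "OPENAI_API_KEY"]
missing = [name for name in required if not os.getenv(name)]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")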
Production Notes: This agent includes error handling, iterative refinement, gap analysis, and professional report generation. For real-world deployment, also plan for retries with backoff, API rate limits, structured logging, and tests around the LLM response parsing.
Enhancements
Add Caching
import hashlib
import json
from dataclasses import asdict
from pathlib import Path


class CachedSearchAgent(SearchAgent):
    """Search agent with on-disk caching."""

    def __init__(self, cache_dir: str = "./cache", **kwargs):
        super().__init__(**kwargs)
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

    def search(self, query: str, num_results: int = 10) -> List[SearchResult]:
        """Search with caching."""
        # Key the cache on the query and result count
        cache_key = hashlib.md5(
            f"{query}:{num_results}".encode()
        ).hexdigest()
        cache_file = self.cache_dir / f"{cache_key}.json"

        # Check cache
        if cache_file.exists():
            print(f"📦 Using cached results for: {query}")
            with open(cache_file) as f:
                data = json.load(f)
            return [SearchResult(**item) for item in data]

        # Perform search and cache the results
        results = super().search(query, num_results)
        with open(cache_file, 'w') as f:
            json.dump([asdict(r) for r in results], f)

        return results
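Usage is a drop-in replacement for SearchAgent. Note there is no cache expiry; delete the cache directory to force fresh results:

search_agent = CachedSearchAgent(cache_dir="./cache")
results = search_agent.search("AI agent architectures")  # First call hits the API
results = search_agent.search("AI agent architectures")  # Repeat call reads the cache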
Add Progress Tracking
from tqdm import tqdm


class ProgressTrackingOrchestrator(ResearchOrchestrator):
    """Orchestrator with progress tracking."""

    def research(self, topic: str, depth: str = "standard", max_iterations: int = 2) -> str:
        """Research with a progress bar over the planned search queries."""
        config = self._get_depth_config(depth)
        total_queries = max_iterations * config['queries_per_iteration']
        original_search = self.search_agent.search
        with tqdm(total=total_queries, desc="Research Progress") as pbar:
            # Each completed search query advances the bar by one
            def tracked_search(query, num_results=10):
                results = original_search(query, num_results)
                pbar.update(1)
                return results
            self.search_agent.search = tracked_search
            try:
                return super().research(topic, depth, max_iterations)
            finally:
                self.search_agent.search = original_search
Optimization: Cache search results and extracted content to avoid redundant API calls and speed up iterative research.
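The callout mentions caching extracted content as well, which the code above doesn't show. A minimal sketch mirroring CachedSearchAgent (the class name and cache layout are assumptions):

import hashlib
from pathlib import Path

class CachedContentExtractor(ContentExtractor):
    """Content extractor that caches extracted text on disk."""

    def __init__(self, cache_dir: str = "./cache/content", **kwargs):
        super().__init__(**kwargs)
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def extract(self, url: str) -> Optional[str]:
        # Key the cache on the URL
        cache_file = self.cache_dir / f"{hashlib.md5(url.encode()).hexdigest()}.txt"
        if cache_file.exists():
            print(f"📦 Using cached content for: {url}")
            return cache_file.read_text(encoding='utf-8')
        content = super().extract(url)
        if content:
            cache_file.write_text(content, encoding='utf-8')
        return content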
Key Takeaways
- Modular design - separate components for search, extraction, analysis, and reporting
- Iterative refinement - use gap analysis to guide follow-up research
- Error handling - gracefully handle API failures and extraction errors
- Caching - avoid redundant operations for efficiency
- Professional output - generate well-formatted, comprehensive reports
Quiz
Test your understanding of autonomous research agents: