#!/usr/bin/env bun
/**
 * Web Researcher - Web-based research automation
 * Handles web search and content extraction for research tasks
 */
import { randomUUID } from 'node:crypto';

interface ResearchOptions {
  query: string;
  depth?: 'quick' | 'comprehensive' | 'recent';
  maxResults?: number;
  sources?: string[];
  excludeDomains?: string[];
}

interface SearchResult {
  title: string;
  snippet: string;
  url?: string;
  content?: string;
  source: string;
  relevanceScore: number;
  publishDate?: string;
}

/**
 * Web Researcher class for automating web-based research
 */
class WebResearcher {
  // userAgent and maxRetries are reserved for real HTTP requests; the simulated
  // search and fetch paths below only use requestDelay.
  private readonly userAgent = 'Mozilla/5.0 (compatible; ResearchBot/1.0)';
  private readonly maxRetries = 3;
  private readonly requestDelay = 1000; // 1 second between requests

  /**
   * Performs web research based on the provided options
   *
   * @param options - Research configuration options
   * @returns Filtered and ranked search results
   */
  async performResearch(options: ResearchOptions): Promise<SearchResult[]> {
    this.validateOptions(options);
    try {
      // Simulate web search results
      const results = await this.simulateWebSearch(options);
      // Filter and rank results
      const filteredResults = this.filterResults(results, options);
      return this.rankResults(filteredResults);
    } catch (error) {
      console.error('❌ Research failed:', error);
      throw error;
    }
  }

  /**
   * Simulates web search functionality
   * In a real implementation, this would use search APIs or web scraping
   *
   * @param options - Research options
   * @returns Simulated search results
   */
  private async simulateWebSearch(options: ResearchOptions): Promise<SearchResult[]> {
    // Simulate API delay
    await this.delay(500);
    const mockResults: SearchResult[] = [
      {
        title: `${options.query} - Wikipedia`,
        snippet: `Comprehensive information about ${options.query} including history, operations, and key facts.`,
        url: `https://en.wikipedia.org/wiki/${encodeURIComponent(options.query)}`,
        source: 'Wikipedia',
        relevanceScore: 0.9,
        publishDate: new Date().toISOString(),
      },
      {
        title: `${options.query} Official Website`,
        snippet: `Official information about ${options.query}, including company profile, leadership, and recent news.`,
        url: `https://www.${options.query.toLowerCase().replace(/\s+/g, '')}.com`,
        source: 'Official Website',
        relevanceScore: 0.95,
        publishDate: new Date().toISOString(),
      },
      {
        title: `${options.query} - Crunchbase Profile`,
        snippet: `Funding information, investors, and company details for ${options.query}.`,
        url: `https://www.crunchbase.com/organization/${options.query.toLowerCase().replace(/\s+/g, '-')}`,
        source: 'Crunchbase',
        relevanceScore: 0.85,
        publishDate: new Date().toISOString(),
      },
      {
        title: `${options.query} Financial Performance`,
        snippet: `Latest financial results, revenue data, and market performance for ${options.query}.`,
        url: `https://finance.yahoo.com/quote/${options.query.toUpperCase()}`,
        source: 'Yahoo Finance',
        relevanceScore: 0.8,
        publishDate: new Date().toISOString(),
      },
      {
        title: `${options.query} Industry Analysis`,
        snippet: `Market position, competitors, and industry analysis for ${options.query}.`,
        url: `https://www.industry-analysis.com/${options.query.toLowerCase().replace(/\s+/g, '-')}`,
        source: 'Industry Analysis',
        relevanceScore: 0.75,
        publishDate: new Date().toISOString(),
      },
    ];
    // Return subset based on maxResults (defaults to 10)
    const maxResults = options.maxResults ?? 10;
    return mockResults.slice(0, Math.min(maxResults, mockResults.length));
  }
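
  /**
   * Sketch of what a non-simulated search could look like. This is illustrative
   * only: the endpoint, query parameter, and response shape below are assumptions,
   * not an API this project actually uses, and the method is not called by
   * performResearch.
   *
   * @param options - Research options
   * @returns Search results mapped from the assumed provider response
   */
  async searchLive(options: ResearchOptions): Promise<SearchResult[]> {
    // Hypothetical JSON search endpoint; substitute a real provider and API key.
    const endpoint = `https://api.example-search.test/v1/search?q=${encodeURIComponent(options.query)}`;
    const response = await fetch(endpoint, {
      headers: { 'User-Agent': this.userAgent },
    });
    if (!response.ok) {
      throw new Error(`Search request failed with status ${response.status}`);
    }
    // Assumed response shape: { items: [{ title, snippet, link, date? }] }
    const data = (await response.json()) as {
      items: { title: string; snippet: string; link: string; date?: string }[];
    };
    return data.items.map(item => ({
      title: item.title,
      snippet: item.snippet,
      url: item.link,
      source: new URL(item.link).hostname,
      relevanceScore: 0.5, // No provider score assumed; rankResults reorders later
      publishDate: item.date,
    }));
  }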

  /**
   * Filters search results based on research options
   *
   * @param results - Raw search results
   * @param options - Research options
   * @returns Filtered results
   */
  private filterResults(results: SearchResult[], options: ResearchOptions): SearchResult[] {
    let filtered = [...results];
    // Filter by excluded domains
    if (options.excludeDomains && options.excludeDomains.length > 0) {
      filtered = filtered.filter(result => {
        const url = result.url || '';
        return !options.excludeDomains!.some(domain => url.includes(domain));
      });
    }
    // Filter by specific sources
    if (options.sources && options.sources.length > 0) {
      filtered = filtered.filter(result =>
        options.sources!.some(source =>
          result.source.toLowerCase().includes(source.toLowerCase())
        )
      );
    }
    // Filter by relevance score
    const minRelevance = this.getMinRelevanceScore(options.depth);
    filtered = filtered.filter(result => result.relevanceScore >= minRelevance);
    return filtered;
  }

  /**
   * Ranks search results by relevance and other factors
   *
   * @param results - Filtered search results
   * @returns Ranked search results
   */
  private rankResults(results: SearchResult[]): SearchResult[] {
    return [...results].sort((a, b) => {
      // Primary sort: relevance score (higher first)
      if (b.relevanceScore !== a.relevanceScore) {
        return b.relevanceScore - a.relevanceScore;
      }
      // Secondary sort: publish date (more recent first)
      if (a.publishDate && b.publishDate) {
        return new Date(b.publishDate).getTime() - new Date(a.publishDate).getTime();
      }
      // Tertiary sort: source authority (higher authority first)
      return this.getSourceAuthorityScore(b.source) - this.getSourceAuthorityScore(a.source);
    });
  }

  /**
   * Gets the minimum relevance score based on research depth
   *
   * @param depth - Research depth level
   * @returns Minimum relevance score threshold
   */
  private getMinRelevanceScore(depth?: string): number {
    switch (depth) {
      case 'comprehensive':
        return 0.6;
      case 'recent':
        return 0.7;
      case 'quick':
      default:
        return 0.8;
    }
  }

  /**
   * Gets source authority score for ranking
   *
   * @param source - Source name
   * @returns Authority score
   */
  private getSourceAuthorityScore(source: string): number {
    const authorityScores: Record<string, number> = {
      'Official Website': 10,
      'Wikipedia': 9,
      'Reuters': 9,
      'Bloomberg': 9,
      'Yahoo Finance': 8,
      'Crunchbase': 8,
      'Industry Analysis': 7,
      'News': 6,
    };
    return authorityScores[source] || 5;
  }

  /**
   * Fetches full content from a URL
   * In a real implementation, this would perform HTTP requests and content extraction
   *
   * @param url - URL to fetch content from
   * @returns Extracted content
   */
  async fetchContent(url: string): Promise<string> {
    // Simulate content fetching
    await this.delay(this.requestDelay);
    // Return mock content
    return `This is the extracted content from ${url}. In a real implementation, this would contain the actual web page content after processing and cleaning.`;
  }
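
  /**
   * Sketch of a non-simulated fetch with retries, shown for illustration and not
   * wired into the rest of the class. It assumes raw response text is an acceptable
   * result; a production version would also extract and clean the main content.
   *
   * @param url - URL to fetch content from
   * @returns Raw response body text
   */
  async fetchContentLive(url: string): Promise<string> {
    let lastError: unknown;
    for (let attempt = 1; attempt <= this.maxRetries; attempt++) {
      try {
        const response = await fetch(url, {
          headers: { 'User-Agent': this.userAgent },
        });
        if (!response.ok) {
          throw new Error(`HTTP ${response.status} for ${url}`);
        }
        return await response.text();
      } catch (error) {
        lastError = error;
        // Back off before retrying, scaling the configured delay by attempt number
        await this.delay(this.requestDelay * attempt);
      }
    }
    throw new Error(`Failed to fetch ${url} after ${this.maxRetries} attempts: ${String(lastError)}`);
  }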

  /**
   * Closes the researcher and cleans up resources
   */
  close(): void {
    // No persistent connections or handles in the simulated implementation;
    // kept so callers can treat the researcher like a real client.
  }

  /**
   * Utility function to create delays
   *
   * @param ms - Milliseconds to delay
   * @returns Promise that resolves after the delay
   */
  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Generates a unique research session ID
   *
   * @returns Unique session ID
   */
  generateSessionId(): string {
    return `research_${randomUUID()}`;
  }

  /**
   * Validates research options
   *
   * @param options - Research options to validate
   * @throws Error if options are invalid
   */
  private validateOptions(options: ResearchOptions): void {
    if (!options.query || options.query.trim().length === 0) {
      throw new Error('Query is required and cannot be empty');
    }
    if (options.maxResults !== undefined && (options.maxResults < 1 || options.maxResults > 100)) {
      throw new Error('Max results must be between 1 and 100');
    }
    if (options.depth && !['quick', 'comprehensive', 'recent'].includes(options.depth)) {
      throw new Error('Depth must be one of: quick, comprehensive, recent');
    }
  }
}
export { WebResearcher, type ResearchOptions, type SearchResult };
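
// Illustrative usage when the script is executed directly with Bun. The example
// query and option values are placeholders, not an established CLI contract.
if (import.meta.main) {
  const researcher = new WebResearcher();
  const results = await researcher.performResearch({
    query: process.argv[2] ?? 'Example Corp',
    depth: 'comprehensive',
    maxResults: 5,
  });
  console.log(JSON.stringify(results, null, 2));
  researcher.close();
}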