#!/usr/bin/env bun
/**
 * Web Researcher - Web-based research automation
 * Handles web search and content extraction for research tasks
 */
import { randomUUID } from 'node:crypto';

interface ResearchOptions {
  query: string;
  depth?: 'quick' | 'comprehensive' | 'recent';
  maxResults?: number;
  sources?: string[];
  excludeDomains?: string[];
}

interface SearchResult {
  title: string;
  snippet: string;
  url?: string;
  content?: string;
  source: string;
  relevanceScore: number;
  publishDate?: string;
}

/**
 * Web Researcher class for automating web-based research
 */
class WebResearcher {
  // userAgent and maxRetries are reserved for real HTTP requests; the simulated
  // search and fetch paths below only use requestDelay.
  private readonly userAgent = 'Mozilla/5.0 (compatible; ResearchBot/1.0)';
  private readonly maxRetries = 3;
  private readonly requestDelay = 1000; // 1 second between requests

  /**
   * Performs web research based on the provided options
   *
   * @param options - Research configuration options
   * @returns Filtered and ranked search results
   */
  async performResearch(options: ResearchOptions): Promise<SearchResult[]> {
    this.validateOptions(options);
    try {
      // Simulate web search results
      const results = await this.simulateWebSearch(options);
      // Filter and rank results
      const filteredResults = this.filterResults(results, options);
      return this.rankResults(filteredResults);
    } catch (error) {
      console.error('❌ Research failed:', error);
      throw error;
    }
  }

  /**
   * Simulates web search functionality
   * In a real implementation, this would use search APIs or web scraping
   *
   * @param options - Research options
   * @returns Simulated search results
   */
  private async simulateWebSearch(options: ResearchOptions): Promise<SearchResult[]> {
    // Simulate API delay
    await this.delay(500);
    const mockResults: SearchResult[] = [
      {
        title: `${options.query} - Wikipedia`,
        snippet: `Comprehensive information about ${options.query} including history, operations, and key facts.`,
        url: `https://en.wikipedia.org/wiki/${encodeURIComponent(options.query)}`,
        source: 'Wikipedia',
        relevanceScore: 0.9,
        publishDate: new Date().toISOString(),
      },
      {
        title: `${options.query} Official Website`,
        snippet: `Official information about ${options.query}, including company profile, leadership, and recent news.`,
        url: `https://www.${options.query.toLowerCase().replace(/\s+/g, '')}.com`,
        source: 'Official Website',
        relevanceScore: 0.95,
        publishDate: new Date().toISOString(),
      },
      {
        title: `${options.query} - Crunchbase Profile`,
        snippet: `Funding information, investors, and company details for ${options.query}.`,
        url: `https://www.crunchbase.com/organization/${options.query.toLowerCase().replace(/\s+/g, '-')}`,
        source: 'Crunchbase',
        relevanceScore: 0.85,
        publishDate: new Date().toISOString(),
      },
      {
        title: `${options.query} Financial Performance`,
        snippet: `Latest financial results, revenue data, and market performance for ${options.query}.`,
        url: `https://finance.yahoo.com/quote/${options.query.toUpperCase()}`,
        source: 'Yahoo Finance',
        relevanceScore: 0.8,
        publishDate: new Date().toISOString(),
      },
      {
        title: `${options.query} Industry Analysis`,
        snippet: `Market position, competitors, and industry analysis for ${options.query}.`,
        url: `https://www.industry-analysis.com/${options.query.toLowerCase().replace(/\s+/g, '-')}`,
        source: 'Industry Analysis',
        relevanceScore: 0.75,
        publishDate: new Date().toISOString(),
      },
    ];
    // Return subset based on maxResults (defaults to 10)
    const maxResults = options.maxResults ?? 10;
    return mockResults.slice(0, Math.min(maxResults, mockResults.length));
  }
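
  /**
   * Sketch of what a non-simulated search could look like. This is illustrative
   * only: the endpoint, query parameter, and response shape below are assumptions,
   * not an API this project actually uses, and the method is not called by
   * performResearch.
   *
   * @param options - Research options
   * @returns Search results mapped from the assumed provider response
   */
  async searchLive(options: ResearchOptions): Promise<SearchResult[]> {
    // Hypothetical JSON search endpoint; substitute a real provider and API key.
    const endpoint = `https://api.example-search.test/v1/search?q=${encodeURIComponent(options.query)}`;
    const response = await fetch(endpoint, {
      headers: { 'User-Agent': this.userAgent },
    });
    if (!response.ok) {
      throw new Error(`Search request failed with status ${response.status}`);
    }
    // Assumed response shape: { items: [{ title, snippet, link, date? }] }
    const data = (await response.json()) as {
      items: { title: string; snippet: string; link: string; date?: string }[];
    };
    return data.items.map(item => ({
      title: item.title,
      snippet: item.snippet,
      url: item.link,
      source: new URL(item.link).hostname,
      relevanceScore: 0.5, // No provider score assumed; rankResults reorders later
      publishDate: item.date,
    }));
  }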

  /**
   * Filters search results based on research options
   *
   * @param results - Raw search results
   * @param options - Research options
   * @returns Filtered results
   */
  private filterResults(results: SearchResult[], options: ResearchOptions): SearchResult[] {
    let filtered = [...results];
    // Filter by excluded domains
    if (options.excludeDomains && options.excludeDomains.length > 0) {
      filtered = filtered.filter(result => {
        const url = result.url || '';
        return !options.excludeDomains!.some(domain => url.includes(domain));
      });
    }
    // Filter by specific sources
    if (options.sources && options.sources.length > 0) {
      filtered = filtered.filter(result =>
        options.sources!.some(source =>
          result.source.toLowerCase().includes(source.toLowerCase())
        )
      );
    }
    // Filter by relevance score
    const minRelevance = this.getMinRelevanceScore(options.depth);
    filtered = filtered.filter(result => result.relevanceScore >= minRelevance);
    return filtered;
  }

  /**
   * Ranks search results by relevance and other factors
   *
   * @param results - Filtered search results
   * @returns Ranked search results
   */
  private rankResults(results: SearchResult[]): SearchResult[] {
    return [...results].sort((a, b) => {
      // Primary sort: relevance score (higher first)
      if (b.relevanceScore !== a.relevanceScore) {
        return b.relevanceScore - a.relevanceScore;
      }
      // Secondary sort: publish date (more recent first)
      if (a.publishDate && b.publishDate) {
        return new Date(b.publishDate).getTime() - new Date(a.publishDate).getTime();
      }
      // Tertiary sort: source authority (higher authority first)
      return this.getSourceAuthorityScore(b.source) - this.getSourceAuthorityScore(a.source);
    });
  }

  /**
   * Gets the minimum relevance score based on research depth
   *
   * @param depth - Research depth level
   * @returns Minimum relevance score threshold
   */
  private getMinRelevanceScore(depth?: string): number {
    switch (depth) {
      case 'comprehensive':
        return 0.6;
      case 'recent':
        return 0.7;
      case 'quick':
      default:
        return 0.8;
    }
  }

  /**
   * Gets source authority score for ranking
   *
   * @param source - Source name
   * @returns Authority score
   */
  private getSourceAuthorityScore(source: string): number {
    const authorityScores: Record<string, number> = {
      'Official Website': 10,
      'Wikipedia': 9,
      'Reuters': 9,
      'Bloomberg': 9,
      'Yahoo Finance': 8,
      'Crunchbase': 8,
      'Industry Analysis': 7,
      'News': 6,
    };
    return authorityScores[source] || 5;
  }

  /**
   * Fetches full content from a URL
   * In a real implementation, this would perform HTTP requests and content extraction
   *
   * @param url - URL to fetch content from
   * @returns Extracted content
   */
  async fetchContent(url: string): Promise<string> {
    // Simulate content fetching
    await this.delay(this.requestDelay);
    // Return mock content
    return `This is the extracted content from ${url}. In a real implementation, this would contain the actual web page content after processing and cleaning.`;
  }
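
  /**
   * Sketch of a non-simulated fetch with retries, shown for illustration and not
   * wired into the rest of the class. It assumes raw response text is an acceptable
   * result; a production version would also extract and clean the main content.
   *
   * @param url - URL to fetch content from
   * @returns Raw response body text
   */
  async fetchContentLive(url: string): Promise<string> {
    let lastError: unknown;
    for (let attempt = 1; attempt <= this.maxRetries; attempt++) {
      try {
        const response = await fetch(url, {
          headers: { 'User-Agent': this.userAgent },
        });
        if (!response.ok) {
          throw new Error(`HTTP ${response.status} for ${url}`);
        }
        return await response.text();
      } catch (error) {
        lastError = error;
        // Back off before retrying, scaling the configured delay by attempt number
        await this.delay(this.requestDelay * attempt);
      }
    }
    throw new Error(`Failed to fetch ${url} after ${this.maxRetries} attempts: ${String(lastError)}`);
  }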

  /**
   * Closes the researcher and cleans up resources
   */
  close(): void {
    // No persistent connections or handles in the simulated implementation;
    // kept so callers can treat the researcher like a real client.
  }

  /**
   * Utility function to create delays
   *
   * @param ms - Milliseconds to delay
   * @returns Promise that resolves after the delay
   */
  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Generates a unique research session ID
   *
   * @returns Unique session ID
   */
  generateSessionId(): string {
    return `research_${randomUUID()}`;
  }

  /**
   * Validates research options
   *
   * @param options - Research options to validate
   * @throws Error if options are invalid
   */
  private validateOptions(options: ResearchOptions): void {
    if (!options.query || options.query.trim().length === 0) {
      throw new Error('Query is required and cannot be empty');
    }
    if (options.maxResults !== undefined && (options.maxResults < 1 || options.maxResults > 100)) {
      throw new Error('Max results must be between 1 and 100');
    }
    if (options.depth && !['quick', 'comprehensive', 'recent'].includes(options.depth)) {
      throw new Error('Depth must be one of: quick, comprehensive, recent');
    }
  }
}
export { WebResearcher, type ResearchOptions, type SearchResult };
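
// Illustrative usage when the script is executed directly with Bun. The example
// query and option values are placeholders, not an established CLI contract.
if (import.meta.main) {
  const researcher = new WebResearcher();
  const results = await researcher.performResearch({
    query: process.argv[2] ?? 'Example Corp',
    depth: 'comprehensive',
    maxResults: 5,
  });
  console.log(JSON.stringify(results, null, 2));
  researcher.close();
}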