gh-shrwnsan-vibekit-claude-…/hooks/handle-web-search.mjs

// hooks/handle-web-search.mjs
import { tavily, extractContent } from './content-extractor.mjs';
import { handleWebSearchError } from './handle-search-error.mjs';

// API key resolution: the namespaced SEARCH_PLUS_* variable wins, the legacy
// un-prefixed variable is the fallback, and null means "not configured".
const TAVILY_API_KEY =
  [process.env.SEARCH_PLUS_TAVILY_API_KEY, process.env.TAVILY_API_KEY].find(Boolean) || null;
const JINAAI_API_KEY =
  [process.env.SEARCH_PLUS_JINAAI_API_KEY, process.env.JINAAI_API_KEY].find(Boolean) || null;

// Warn once at load time for every legacy variable that is set while its
// namespaced replacement is not.
for (const [legacyName, namespacedName, notice] of [
  [
    'TAVILY_API_KEY',
    'SEARCH_PLUS_TAVILY_API_KEY',
    '⚠️  TAVILY_API_KEY is deprecated. Please update to SEARCH_PLUS_TAVILY_API_KEY'
  ],
  [
    'JINAAI_API_KEY',
    'SEARCH_PLUS_JINAAI_API_KEY',
    '⚠️  JINAAI_API_KEY is deprecated. Please update to SEARCH_PLUS_JINAAI_API_KEY'
  ]
]) {
  if (process.env[legacyName] && !process.env[namespacedName]) {
    console.warn(notice);
  }
}

/**
 * Reports whether the input parses as an absolute web address.
 * Only the http: and https: schemes count; anything the URL parser rejects,
 * or any other scheme, yields false.
 * @param {string} input - Candidate text to test
 * @returns {boolean} True when the input is an http(s) URL
 */
function isURL(input) {
  let parsed;
  try {
    parsed = new URL(input);
  } catch {
    // The URL constructor throws for anything that is not an absolute URL.
    return false;
  }
  return ['http:', 'https:'].includes(parsed.protocol);
}

/**
 * Entry point for search requests. When the query is an http(s) URL the page
 * content is extracted; otherwise the hybrid web search runs, with a recovery
 * pass through handleWebSearchError if every search strategy fails.
 *
 * @param {Object} [params] - Search parameters
 * @param {string} [params.query] - Search text or URL (`params.q` is accepted as an alias)
 * @param {number} [params.maxRetries=3] - Retry budget forwarded to URL extraction; an explicit 0 is honoured
 * @param {number} [params.timeout=10000] - Timeout in milliseconds forwarded to the search/extraction helpers
 * @param {number} [params.maxResults=5] - Maximum number of results requested
 * @param {boolean} [params.includeAnswer=true] - Only an explicit `false` disables it
 * @param {boolean} [params.includeRawContent=false] - Forwarded to the search services
 * @returns {Promise<Object>} `{ success: true, data, ... }` on success, otherwise `{ error: true, message, ... }`
 */
export async function handleWebSearch(params = {}) {
  const options = params ?? {};
  const query = options.query || options.q || '';
  // `??` rather than `||` so that a caller can ask for zero retries.
  const maxRetries = options.maxRetries ?? 3;
  const timeout = options.timeout || 10000; // 10 seconds default

  if (!query) {
    return {
      error: true,
      message: 'No search query or URL provided'
    };
  }

  // A URL query means "fetch this page", not "search for this text".
  if (isURL(query)) {
    console.log(`🔍 Extracting content from URL: ${query}`);
    // Guard against an extraction helper that resolves without a result object.
    const result = (await handleURLExtraction(query, { maxRetries, timeout })) ?? {
      error: true,
      message: 'URL extraction returned no result',
      isURLExtraction: true
    };

    // Provide brief status feedback
    if (result.success) {
      console.log(`✅ URL extraction completed successfully`);
    } else {
      console.log(`❌ URL extraction failed: ${result.message}`);
    }

    return result;
  }

  console.log(`🔍 Searching: ${query}`);

  // Single source of truth for the request options, so the primary search and
  // the recovery path cannot drift apart. Headers are regenerated per call.
  const buildSearchParams = () => ({
    query,
    maxResults: options.maxResults || 5,
    includeAnswer: options.includeAnswer !== false,
    includeRawContent: options.includeRawContent || false,
    headers: generateRandomHeaders()
  });

  try {
    const result = await performHybridSearch(buildSearchParams(), timeout);

    return {
      success: true,
      data: result.data,
      service: result.service,
      attempt: 1
    };
  } catch (error) {
    console.error('All search strategies failed:', error.message);

    // Final error handling for recovery attempts
    const errorResult = await handleWebSearchError(error, {
      ...buildSearchParams(),
      timeout,
      attempt: 1,
      error
    });

    if (errorResult && errorResult.success) {
      return {
        success: true,
        data: errorResult.data,
        attempt: 1,
        errorRecovered: true,
        originalError: error.message,
        recoveryMessage: errorResult.message
      };
    }

    return {
      error: true,
      message: errorResult?.message || error.message,
      attempt: 1,
      errorHandlingApplied: true
    };
  }
}

/**
 * Hybrid web search with intelligent service selection
 * Sequential: Tavily → Parallel free services
 * Note: Jina API is only used for URL extraction, not web search
 *
 * @param {Object} params - Search parameters forwarded unchanged to each service
 * @param {number} [timeoutMs=10000] - Timeout forwarded to each service call
 * @returns {Promise<{data: Object, service: string}>} Result of the first service that succeeds
 * @throws {Error} When every service fails; `error.cause` holds the AggregateError
 *   from the free-service race so the individual failures stay inspectable
 */
async function performHybridSearch(params, timeoutMs = 10000) {
  // Phase 1: Try Tavily API (premium service) — only when a key is configured
  if (TAVILY_API_KEY) {
    try {
      console.log('🚀 Trying Tavily API...');
      const result = await tavily.search(params, timeoutMs);
      return { data: result, service: 'tavily' };
    } catch (error) {
      // Best-effort: fall through to the free services, but say why Tavily failed.
      const reason = error?.message ?? String(error);
      console.log(`🔄 Tavily failed (${reason}), trying free services...`);
    }
  }

  // Phase 2: start all free services at once; the first to fulfil wins
  console.log('🌐 Trying all free search engines in parallel...');
  const freeStrategies = [
    trySearXNGSearch(params, timeoutMs),
    tryDuckDuckGoHTML(params, timeoutMs),
    tryStartpageHTML(params, timeoutMs)
  ];

  try {
    const result = await Promise.any(freeStrategies);
    console.log(`✅ Success with free service: ${result.service}`);
    return result;
  } catch (aggregateError) {
    // Promise.any rejects only after every strategy has rejected.
    throw new Error(
      'All search services failed. Try again or configure Tavily API key for enhanced reliability.',
      { cause: aggregateError }
    );
  }
}


/**
 * Attempts search using SearXNG metasearch engine.
 * The public instances are tried one after another; the first one that answers
 * with a non-empty `results` array wins.
 *
 * @param {Object} params - Search parameters (`query`, optional `maxResults`, optional `headers`)
 * @param {number} [timeoutMs=10000] - Per-instance request timeout in milliseconds
 * @returns {Promise<{data: Object, service: string}>} Results in a Tavily-like shape;
 *   `data.response_time` is the elapsed time of this call in milliseconds
 * @throws {Error} When no instance produced results
 */
async function trySearXNGSearch(params, timeoutMs = 10000) {
  const searxngInstances = [
    'https://search.brave.works',
    'https://searx.be',
    'https://searx.tiekoetter.com',
    'https://search.snopyta.org'
  ];

  const query = encodeURIComponent(params.query);
  const maxResults = params.maxResults || 5;
  // Monotonic start mark; elapsed time must compare two readings of the SAME clock.
  const startedAt = performance.now();

  for (const instance of searxngInstances) {
    try {
      const searchUrl = `${instance}/search?q=${query}&format=json&engines=google,duckduckgo,startpage&results=${maxResults}`;

      const response = await fetch(searchUrl, {
        method: 'GET',
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept': 'application/json, text/plain, */*',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept-Encoding': 'gzip, deflate',
          'Connection': 'keep-alive',
          'Referer': instance,
          'Sec-Fetch-Dest': 'empty',
          'Sec-Fetch-Mode': 'cors',
          'Sec-Fetch-Site': 'same-origin',
          // Caller-supplied headers are spread last, so they override the defaults above.
          ...params.headers
        },
        signal: AbortSignal.timeout(timeoutMs)
      });

      if (!response.ok) {
        continue; // Try next instance
      }

      const data = await response.json();

      if (!data.results || data.results.length === 0) {
        continue; // Try next instance
      }

      // Transform SearXNG results to Tavily-like format
      const transformedResults = {
        results: data.results.slice(0, maxResults).map((item, index) => ({
          title: item.title,
          url: item.url,
          content: item.content || '',
          score: 1.0 - (index * 0.1), // Simple rank-based scoring
          published_date: item.publishedDate || null
        })),
        answer: data.answers?.[0] || null,
        query: params.query,
        response_time: Math.round(performance.now() - startedAt)
      };

      return { data: transformedResults, service: 'searxng' };

    } catch (error) {
      // Network failure, timeout or invalid JSON: report and move on to the next instance.
      console.log(`❌ SearXNG instance ${instance} failed: ${error.message}`);
    }
  }

  throw new Error('All SearXNG instances failed');
}

/**
 * Attempts search using DuckDuckGo HTML parsing.
 * Fetches the HTML-only results page and scrapes title, link and snippet of
 * each result with a regular expression.
 *
 * @param {Object} params - Search parameters (`query`, optional `maxResults`, optional `headers`)
 * @param {number} [timeoutMs=10000] - Request timeout in milliseconds
 * @returns {Promise<{data: Object, service: string}>} Results in a Tavily-like shape;
 *   `data.response_time` is the elapsed time of this call in milliseconds
 * @throws {Error} On a non-2xx response or when no result could be parsed
 */
async function tryDuckDuckGoHTML(params, timeoutMs = 10000) {
  const query = encodeURIComponent(params.query);
  const maxResults = params.maxResults || 5;
  // Monotonic start mark; elapsed time must compare two readings of the SAME clock.
  const startedAt = performance.now();

  const searchUrl = `https://html.duckduckgo.com/html/?q=${query}&kl=us-en`;

  const response = await fetch(searchUrl, {
    method: 'GET',
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Sec-Fetch-Dest': 'document',
      'Sec-Fetch-Mode': 'navigate',
      'Sec-Fetch-Site': 'none',
      'Cache-Control': 'max-age=0',
      // Caller-supplied headers are spread last, so they override the defaults above.
      ...params.headers
    },
    signal: AbortSignal.timeout(timeoutMs)
  });

  if (!response.ok) {
    throw new Error(`DuckDuckGo HTML error: ${response.status}`);
  }

  const html = await response.text();

  // Parse HTML results. The snippet group is lazy-any rather than `[^<]*` so a
  // snippet containing inline markup (e.g. <b>…</b>) is still captured with its
  // own result instead of being skipped in favour of a later result's snippet.
  const results = [];
  const resultRegex = /<div class="result">[\s\S]*?<a rel="nofollow" class="result__a" href="([^"]+)">([^<]+)<\/a>[\s\S]*?<a class="result__snippet" href="[^"]*">([\s\S]*?)<\/a>/g;

  let match;
  while ((match = resultRegex.exec(html)) !== null && results.length < maxResults) {
    const [, url, title, snippet] = match;

    if (url && title && !url.includes('//r.jina.ai/http')) { // Filter out redirect links
      results.push({
        title: title.trim(),
        // Protocol-relative links ("//host/path") are promoted to https.
        url: url.startsWith('http') ? url : `https:${url}`,
        content: snippet ? snippet.replace(/<[^>]*>/g, '').trim() : '',
        score: 1.0 - (results.length * 0.1)
      });
    }
  }

  if (results.length === 0) {
    throw new Error('No results found in DuckDuckGo HTML response');
  }

  const transformedResults = {
    results,
    answer: null, // DuckDuckGo doesn't provide instant answers in HTML mode
    query: params.query,
    response_time: Math.round(performance.now() - startedAt)
  };

  return { data: transformedResults, service: 'duckduckgo-html' };
}

/**
 * Attempts search using Startpage HTML parsing.
 * Fetches the results page and scrapes title, link and snippet of each result
 * with a regular expression.
 *
 * @param {Object} params - Search parameters (`query`, optional `maxResults`, optional `headers`)
 * @param {number} [timeoutMs=10000] - Request timeout in milliseconds
 * @returns {Promise<{data: Object, service: string}>} Results in a Tavily-like shape;
 *   `data.response_time` is the elapsed time of this call in milliseconds
 * @throws {Error} On a non-2xx response or when no result could be parsed
 */
async function tryStartpageHTML(params, timeoutMs = 10000) {
  const query = encodeURIComponent(params.query);
  const maxResults = params.maxResults || 5;
  // Monotonic start mark; elapsed time must compare two readings of the SAME clock.
  const startedAt = performance.now();

  const searchUrl = `https://www.startpage.com/do/search?query=${query}&cat=web&pl=ext-ff&extVersion=1.3.0`;

  const response = await fetch(searchUrl, {
    method: 'GET',
    headers: {
      'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Sec-Fetch-Dest': 'document',
      'Sec-Fetch-Mode': 'navigate',
      'Sec-Fetch-Site': 'none',
      'Cache-Control': 'max-age=0',
      // Caller-supplied headers are spread last, so they override the defaults above.
      ...params.headers
    },
    signal: AbortSignal.timeout(timeoutMs)
  });

  if (!response.ok) {
    throw new Error(`Startpage HTML error: ${response.status}`);
  }

  const html = await response.text();

  // Parse HTML results (Startpage format). The snippet group is lazy-any rather
  // than `[^<]*` so a snippet containing inline markup is still captured with
  // its own result instead of being skipped in favour of a later one.
  const results = [];
  const resultRegex = /<h3><a href="([^"]+)"[^>]*>([^<]+)<\/a><\/h3>[\s\S]*?<p class="snippet">([\s\S]*?)<\/p>/g;

  let match;
  while ((match = resultRegex.exec(html)) !== null && results.length < maxResults) {
    const [, url, title, snippet] = match;

    if (url && title) {
      results.push({
        title: title.trim(),
        // Protocol-relative links ("//host/path") are promoted to https.
        url: url.startsWith('http') ? url : `https:${url}`,
        content: snippet ? snippet.replace(/<[^>]*>/g, '').trim() : '',
        score: 1.0 - (results.length * 0.1)
      });
    }
  }

  if (results.length === 0) {
    throw new Error('No results found in Startpage HTML response');
  }

  const transformedResults = {
    results,
    answer: null,
    query: params.query,
    response_time: Math.round(performance.now() - startedAt)
  };

  return { data: transformedResults, service: 'startpage-html' };
}

/**
 * Builds a browser-like header set whose User-Agent is drawn at random from a
 * small fixed pool; every other header is constant.
 * @returns {Object} Header name → value map
 */
function generateRandomHeaders() {
  const agentPool = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0'
  ];

  // Uniform pick over the pool.
  const pickedIndex = Math.floor(Math.random() * agentPool.length);

  const headers = { 'User-Agent': agentPool[pickedIndex] };
  headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8';
  headers['Accept-Language'] = 'en-US,en;q=0.5';
  headers['Accept-Encoding'] = 'gzip, deflate';
  headers['Connection'] = 'keep-alive';
  headers['Upgrade-Insecure-Requests'] = '1';
  return headers;
}

/**
 * Determines if an error is retryable.
 *
 * An error qualifies when its `code` is one of 403, 422, 429, 451,
 * 'ECONNREFUSED' or 'ETIMEDOUT', when its message mentions one of those (or a
 * block notice), or when its JSON-serialized form contains a schema-validation
 * marker. This function never throws: it runs inside catch handlers, where a
 * secondary exception would mask the original failure.
 *
 * @param {*} error - The thrown value to check (usually an Error)
 * @returns {boolean} True if the error is retryable
 */
function isRetryableError(error) {
  // `throw null` / `throw undefined` are legal; there is nothing to inspect.
  if (error == null) {
    return false;
  }

  const retryableCodes = [403, 422, 429, 451, 'ECONNREFUSED', 'ETIMEDOUT'];
  if (retryableCodes.includes(error.code)) {
    return true;
  }

  // Coerce so a non-string `message` cannot break the substring checks below.
  const errorMessage = String(error.message ?? '');
  const messageMarkers = [
    '403',
    '422',
    '429',
    '451',
    'SecurityCompromiseError',
    'blocked until',
    'ECONNREFUSED',
    'ETIMEDOUT'
  ];
  if (messageMarkers.some((marker) => errorMessage.includes(marker))) {
    return true;
  }

  // Check for schema validation patterns in the serialized error. Only
  // enumerable properties appear here (an Error's own `message` does not).
  // JSON.stringify throws on circular structures and BigInt values, and
  // returns undefined for functions/symbols, so both cases are neutralized.
  let serialized = '';
  try {
    serialized = (JSON.stringify(error) ?? '').toLowerCase();
  } catch {
    serialized = '';
  }

  const schemaMarkers = ['missing', 'input_schema', 'field required'];
  return schemaMarkers.some((marker) => serialized.includes(marker));
}

/**
 * Handles URL extraction with retry logic.
 *
 * Makes one initial attempt plus up to `maxRetries` retries. Retries happen
 * only for errors that isRetryableError accepts, and each retry waits with
 * exponential backoff (2s, 4s, then capped at 8s).
 *
 * @param {string} url - The URL to extract content from
 * @param {Object} [options] - Extraction options; all keys are forwarded to extractContent
 * @param {number} [options.maxRetries=3] - Number of retries after the first attempt;
 *   non-finite or negative values mean "no retries", fractions are floored
 * @param {number} [options.timeout=15000] - Timeout forwarded to extractContent
 * @returns {Promise<Object>} `{ success: true, data, attempt, isURLExtraction }` or
 *   `{ error: true, message, attempt, isURLExtraction }` — always an object
 */
async function handleURLExtraction(url, options = {}) {
  const settings = options ?? {};
  const { maxRetries = 3, timeout = 15000 } = settings;

  // Normalize the retry budget to a non-negative integer so the loop below
  // always runs at least once and always ends in one of its return statements.
  const requestedRetries = Number(maxRetries);
  const retryLimit =
    Number.isFinite(requestedRetries) && requestedRetries > 0 ? Math.floor(requestedRetries) : 0;

  for (let attempt = 0; attempt <= retryLimit; attempt++) {
    // Back off before every retry to ease rate limiting (no jitter is applied).
    if (attempt > 0) {
      const delay = Math.min(1000 * Math.pow(2, attempt), 8000); // Exponential backoff up to 8s
      await new Promise(resolve => setTimeout(resolve, delay));
    }

    try {
      // Fresh headers per attempt; caller options override the defaults, and the
      // resolved timeout is passed explicitly so its default actually applies.
      const extractOptions = {
        headers: generateRandomHeaders(),
        includeImages: false, // Don't include images by default for faster processing
        ...settings,
        timeout
      };

      const results = await extractContent(url, extractOptions);

      return {
        success: true,
        data: results,
        attempt: attempt + 1,
        isURLExtraction: true
      };

    } catch (error) {
      // Thrown values are not guaranteed to be Error instances.
      const reason = error?.message ?? String(error);
      console.error(`URL extraction attempt ${attempt + 1} failed:`, reason);

      // Give up when the budget is spent or the failure is not worth retrying.
      if (attempt === retryLimit || !isRetryableError(error)) {
        return {
          error: true,
          message: `Failed to extract content from URL: ${reason}`,
          attempt: attempt + 1,
          isURLExtraction: true
        };
      }

      // Continue to next attempt
    }
  }
}