From 590188e7929328e05c66c5fdd63b3444524b6a90 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 08:57:03 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 20 + README.md | 3 + agents/search-plus.md | 91 ++ commands/search-plus.md | 68 ++ hooks/content-extractor.mjs | 1706 +++++++++++++++++++++++++++++++++ hooks/handle-rate-limit.mjs | 117 +++ hooks/handle-search-error.mjs | 887 +++++++++++++++++ hooks/handle-web-search.mjs | 458 +++++++++ hooks/hooks.json | 16 + plugin.lock.json | 73 ++ skills/search-plus/SKILL.md | 72 ++ 11 files changed, 3511 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 agents/search-plus.md create mode 100644 commands/search-plus.md create mode 100644 hooks/content-extractor.mjs create mode 100644 hooks/handle-rate-limit.mjs create mode 100644 hooks/handle-search-error.mjs create mode 100644 hooks/handle-web-search.mjs create mode 100644 hooks/hooks.json create mode 100644 plugin.lock.json create mode 100644 skills/search-plus/SKILL.md diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..01e52dc --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,20 @@ +{ + "name": "search-plus", + "description": "Enhanced web search with multi-service fallback architecture (Tavily + Jina.ai) and comprehensive error handling", + "version": "2.7.1", + "author": { + "name": "shrwnsan" + }, + "skills": [ + "./skills" + ], + "agents": [ + "./agents" + ], + "commands": [ + "./commands" + ], + "hooks": [ + "./hooks" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..76ed51e --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# search-plus + +Enhanced web search with multi-service fallback architecture (Tavily + Jina.ai) and comprehensive error handling diff --git a/agents/search-plus.md b/agents/search-plus.md new file mode 100644 index 0000000..a2fca4c --- 
/dev/null +++ b/agents/search-plus.md @@ -0,0 +1,91 @@ +--- +name: search-plus +description: Enhanced web search and content extraction with intelligent multi-service fallback strategy for reliable access to blocked or problematic domains +model: inherit +skills: meta-searching +--- + +# Search Plus Agent + +Purpose-built subagent for reliable web research and URL extraction. It interprets a query or URL, selects an extraction/search path, applies robust retries, and returns validated, cited results. + +## Inputs +- Query: natural language research question +- URL(s): one or more explicit links to extract +- Constraints (optional): cost/speed preference, max tokens, domains to avoid + +## Outputs +Structured response with: +- Summary: concise answer or extracted content synopsis +- Sources: list of { url, title?, service, status, content_type, length_tokens, error? } +- Details: key findings or sections +- Confidence: low/medium/high with brief rationale +- Notes: rate-limit handling, fallbacks used, remaining gaps + +## Operating Procedure (Runbook) +1) Interpret intent +- Detect if input is URL extraction vs open-ended research; extract all URLs present. +- Identify domain class (docs, news, forums, APIs) and sensitivity (likely to block/captcha). + +2) Choose path +- URL present → Extraction mode +- No URL → Research mode (search → fetch top candidates → extract) + +3) Primary attempt +- Perform search/fetch using default project tools; prioritize fast/reliable provider. +- Normalize and clean content (strip boilerplate, preserve headings/code/links). + +4) Fallback gating (trigger if any): +- HTTP ≥400 (403/404/422/429), empty/near-empty content, obvious paywall/captcha, or target domain in “problematic sites”. + +5) Fallback sequence +- Retry with exponential backoff (respect Retry-After) and jitter. +- Switch service/provider; vary request params and user-agent where supported. +- Prefer documentation-friendly readers for docs.*, readthedocs, github raw, etc. 
+- Cap retries: 2 attempts primary + 2 attempts fallback; stop early on strong success. + +6) Validation and dedupe +- Require non-empty content with ≥ N characters/tokens; dedupe by canonical URL. +- If multiple candidates, rank by relevance, freshness, and completeness. + +7) Summarize and cite +- Produce concise summary with inline citations; include brief methodology only if helpful. + +8) Return structured output +- Include sources with statuses and any errors encountered for transparency. + +## Error Handling Policy +- 403 Forbidden: backoff + alt service; try doc-friendly readers for docs/public sites. +- 429 Rate Limited: honor Retry-After; increase jitter; reduce concurrency. +- 422 Validation: simplify query/params; alternate request shape; reattempt search then fetch. +- ECONNREFUSED/ETIMEDOUT: alternate resolver/service; short cooldown before retry. +- Circuit breaker: abort after capped attempts and report best partial results. + +## Decision Heuristics +- docs.*, readthedocs, *.api.*, github content → prefer documentation readers. +- news/finance (e.g., finance.yahoo.com) → prioritize services with high success on dynamic pages. +- forums/reddit → use readers tolerant of anti-bot measures. +- If page text < threshold or only nav captured → try alternate extractor immediately. + +## Stop Conditions +- High-quality content acquired and summarized; or +- Capped retries reached; or +- Repeated non-retryable errors (4xx except 429) with no viable alternative. + +## Escalation +- Ask for alternate URL or narrower query if content is explicitly paywalled/private. +- Provide top N alternative sources when primary fails. +- Return partial findings with clear caveats and next steps. + +## Do / Don’t +- Do confirm intent briefly, cite all sources, chunk long docs logically. +- Do minimize calls; avoid redundant fetches; cache awareness where available. +- Don’t hallucinate unseen content; don’t loop beyond caps; don’t ignore Retry-After. 
+ +## Examples +- Research: “Compare Claude Code plugin marketplaces and list key differences.” +- URL extract: “Summarize https://docs.anthropic.com/en/docs/claude-code/plugins.” +- Recovery: “Standard search failed with 429; retry with robust error handling.” + +## Notes +- Tooling is inherited (no tools listed in frontmatter), allowing this subagent to use the same approved set as the parent context, including plugin skills and MCP tools when available. diff --git a/commands/search-plus.md b/commands/search-plus.md new file mode 100644 index 0000000..8c93cb8 --- /dev/null +++ b/commands/search-plus.md @@ -0,0 +1,68 @@ +--- +description: Enhanced web search with comprehensive error handling for 403, 422, 429, and ECONNREFUSED errors +usage: /search-plus +parameters: + - name: query + type: string + required: true + description: The search query to execute or URL to extract content from +subagent_type: search-plus +--- + +# Enhanced Web Search Command + +Implements robust web search functionality that handles various blocking mechanisms and URL content extraction: + +## Features + +- Advanced retry logic for failed requests +- Multiple search engine integration +- Request header manipulation to avoid detection +- Connection pooling and timeout management +- Result caching to reduce repeated requests +- Proxy support with rotation +- Rate limiting compliance +- **URL Content Extraction**: Direct content extraction from URLs and permalinks + +## Usage + +### Web Search +```bash +/search-plus "Claude Code plugin documentation" +/search-plus "best practices for API rate limiting" +``` + +### URL Content Extraction +```bash +/search-plus "https://docs.anthropic.com/en/docs/claude-code/plugins" +/search-plus "https://github.com/example/repo" +/search-plus "https://example.com/article" +``` + +## Error Handling + +- Automatic retry for 403 Forbidden errors +- Schema validation error handling for 422 Unprocessable Entity errors +- API version compatibility and "Did 0 
searches..." scenarios +- Rate limiting detection and 429 error handling +- Connection refused error handling +- Timeout management with configurable limits +- Fallback to alternative search engines when primary fails +- URL extraction error recovery with retry logic +- Query reformulation and parameter simplification for validation errors + +## Implementation + +This command enhances Claude Code's existing search functionality by adding comprehensive error handling specifically designed to work around rate limiting, blocking mechanisms, and API validation issues that cause 403, 422, 429, and ECONNREFUSED errors. + +The process follows this flow: +1. **Input Detection**: Determine if input is a search query or URL +2. **For URLs**: Extract content directly using Tavily Extract API with retry logic +3. **For Queries**: Perform web search with enhanced error handling +4. **Error Recovery**: If errors detected (403, 422, 429, ECONNREFUSED, schema validation): + - Classify error type and determine retry strategy + - For 422 errors: Reformulate query, adjust parameters, simplify request structure + - For 403/429 errors: Apply exponential backoff, header rotation, jitter + - For ECONNREFUSED: Retry with alternative connections and increased timeouts + - If retry fails, try alternative search method +5. 
**Return Results**: Formatted results with extraction metadata when applicable \ No newline at end of file diff --git a/hooks/content-extractor.mjs b/hooks/content-extractor.mjs new file mode 100644 index 0000000..e134a32 --- /dev/null +++ b/hooks/content-extractor.mjs @@ -0,0 +1,1706 @@ +// hooks/content-extractor.mjs +import { setTimeout } from 'timers/promises'; +import { promises as dns } from 'dns'; +import net from 'net'; + +/** + * Enhanced Content Extractor with Service Selection Strategy + * + * Implements optimal fallback strategy based on comprehensive testing: + * Primary: Tavily Extract API (100% success rate, 863ms avg) - FASTEST AND MOST RELIABLE + * Fallback: Jina.ai Public Endpoint (75% success rate, 1,066ms avg) - Good for documentation + * Optional: Jina.ai API (88% success rate, 2,331ms avg) - Slower, for cost tracking only + */ + +// Scalable fallback service definitions +const FALLBACK_SERVICES = { + cacheServices: [ + { + name: 'Google Web Cache', + pattern: (url) => `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`, + timeout: 15000, + priority: 1, + notes: 'Google web cache - fastest but sometimes blocked' + }, + { + name: 'Internet Archive JSON API', + pattern: async (url) => { + try { + const response = await fetch(`https://archive.org/wayback/available?url=${encodeURIComponent(url)}`, { + timeout: 10000, + headers: { 'Accept': 'application/json' } + }); + const data = await response.json(); + if (data.archived_snapshots?.closest?.available) { + return data.archived_snapshots.closest.url; + } + return null; + } catch (error) { + return null; + } + }, + timeout: 15000, + priority: 2, + notes: 'Archive.org official API - most reliable for older content' + }, + { + name: 'Internet Archive Direct', + pattern: (url) => `https://web.archive.org/web/2/${encodeURIComponent(url)}`, + timeout: 20000, + priority: 3, + notes: 'Direct archive.org access' + }, + { + name: 'Bing Cache', + pattern: (url) => 
`https://cc.bingj.com/cache.aspx?d=&w=${encodeURIComponent(url)}`, + timeout: 20000, + priority: 4, + notes: 'Microsoft Bing cache - alternative to Google' + }, + { + name: 'Yandex Turbo', + pattern: (url) => `https://yandex.com/turbo?text=${encodeURIComponent(url)}`, + timeout: 15000, + priority: 5, + notes: 'Yandex turbo mode - often good for news/blog content' + } + ], + jinaFormats: [ + { + name: 'Standard', + pattern: (url) => url, + timeout: 10000 + }, + { + name: 'Double Redirect', + pattern: (url) => `https://r.jina.ai/http://${encodeURIComponent(url)}`, + timeout: 12000 + }, + { + name: 'Triple Redirect', + pattern: (url) => `https://r.jina.ai/http://r.jina.ai/http://${encodeURIComponent(url)}`, + timeout: 15000 + }, + { + name: 'Text Extractor', + pattern: (url) => `https://r.jina.ai/http://r.jina.ai/http://textise dot iitty?url=${encodeURIComponent(url)}`, + timeout: 10000 + } + ], + userAgents: [ + { + name: 'Chrome Browser', + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Cache-Control': 'max-age=0' + }, + timeout: 30000 + }, + { + name: 'cURL', + headers: { + 'User-Agent': 'curl/8.0.0', + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive' + }, + timeout: 20000 + }, + { + name: 'Python Requests', + headers: { + 'User-Agent': 'python-requests/2.31.0', + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive' + }, + timeout: 15000 + }, + { + name: 'Wget', + headers: { + 'User-Agent': 'Wget/1.21.3', + 'Accept': '*/*', + 'Accept-Encoding': 'identity' + }, 
+ timeout: 25000 + } + ] +}; + +// Service configuration with fallback for backward compatibility +const TAVILY_API_KEY = process.env.SEARCH_PLUS_TAVILY_API_KEY || process.env.TAVILY_API_KEY || null; +const JINA_API_KEY = process.env.SEARCH_PLUS_JINA_API_KEY || process.env.JINA_API_KEY || null; + +// Show deprecation warning if using old variables +if (!process.env.SEARCH_PLUS_TAVILY_API_KEY && process.env.TAVILY_API_KEY) { + console.warn('⚠️ TAVILY_API_KEY is deprecated. Please update to SEARCH_PLUS_TAVILY_API_KEY'); +} +if (!process.env.SEARCH_PLUS_JINA_API_KEY && process.env.JINA_API_KEY) { + console.warn('⚠️ JINA_API_KEY is deprecated. Please update to SEARCH_PLUS_JINA_API_KEY'); +} +const TAVILY_EXTRACT_URL = 'https://api.tavily.com/extract'; +const JINA_READER_PUBLIC_URL = 'https://r.jina.ai/'; +const JINA_READER_API_URL = 'https://r.jina.ai/'; + +// Service configuration based on research findings +const SERVICES = { + tavily: { + name: 'Tavily Extract API', + url: TAVILY_EXTRACT_URL, + successRate: 100, + avgResponseTime: 863, + cost: 'paid', + requiresAuth: true, + bestFor: ['general', 'problematic_domains', 'financial', 'social_media', 'primary_choice'] + }, + jinaPublic: { + name: 'Jina.ai Public Reader', + url: JINA_READER_PUBLIC_URL, + successRate: 75, + avgResponseTime: 1066, + cost: 'free', + requiresAuth: false, + bestFor: ['documentation', 'api_docs', 'technical_content'] + }, + jinaAPI: { + name: 'Jina.ai API Reader', + url: JINA_READER_API_URL, + successRate: 88, + avgResponseTime: 2331, + cost: 'free', + requiresAuth: true, + bestFor: ['enhanced_metadata', 'reliability'] // 2.7x slower - provides detailed analytics + } +}; + +/** + * Determines if a URL is likely to be documentation-heavy + * Based on research showing Jina.ai excels at documentation extraction + */ +function isDocumentationSite(url) { + const docPatterns = [ + /docs?\./, + /documentation/, + /api.*docs/, + /developer/, + /reference/, + /guide/, + /tutorial/, + /swagger/, + 
/openapi/, + /postman/, + /readthedocs/, + /gitbook/ + ]; + + return docPatterns.some(pattern => pattern.test(url.toLowerCase())); +} + +/** + * Determines if a URL is likely to be problematic for direct access + * Based on research showing Tavily handles these domains better + */ +function isProblematicDomain(url) { + const problematicPatterns = [ + /reddit\.com/, + /finance\.yahoo\.com/, + /twitter\.com/, + /facebook\.com/, + /instagram\.com/, + /linkedin\.com/, + /medium\.com/, + /news\./, + /coingecko\.com/, + /binance\.com/ + ]; + + return problematicPatterns.some(pattern => pattern.test(url.toLowerCase())); +} + +/** + * Validates Tavily API key with a simple test call + */ +async function validateTavilyAPIKey() { + if (!TAVILY_API_KEY) { + return { valid: false, reason: 'API key not configured' }; + } + + try { + const testResponse = await fetch('https://api.tavily.com/search', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + api_key: TAVILY_API_KEY, + query: 'test', + max_results: 1 + }), + signal: AbortSignal.timeout(5000) + }); + + if (testResponse.status === 401 || testResponse.status === 403) { + const errorData = await testResponse.json().catch(() => ({})); + return { + valid: false, + reason: `Invalid API key: ${errorData.detail?.error || 'Unauthorized'}` + }; + } + + return { valid: true }; + } catch (error) { + return { + valid: false, + reason: `API key validation failed: ${error.message}` + }; + } +} + +/** + * Extracts content using Tavily Extract API + */ +async function extractWithTavily(url, options = {}, timeoutMs = 15000) { + const startTime = Date.now(); + + if (!TAVILY_API_KEY) { + throw new Error('Tavily API key not configured'); + } + + const requestBody = { + api_key: TAVILY_API_KEY, + urls: [url.trim()] + }; + + // Add optional parameters + if (options.includeImages) requestBody.include_images = options.includeImages; + if (options.extractDepth) requestBody.extract_depth = 
options.extractDepth; + + try { + const controller = new AbortController(); + const timeoutId = setTimeout(timeoutMs, null).then(() => controller.abort()); + + const response = await fetch(TAVILY_EXTRACT_URL, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + ...options.headers + }, + body: JSON.stringify(requestBody), + signal: controller.signal + }); + + clearTimeout(timeoutId); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error(`Tavily API error: ${response.status} - ${errorData.error || response.statusText}`); + } + + const data = await response.json(); + const content = data.results && data.results[0] ? + data.results[0].content || data.results[0].raw_content : + ''; + + return { + success: true, + content, + contentLength: content.length, + service: 'tavily', + url, + responseTime: Date.now() - startTime, + metadata: { + service: SERVICES.tavily, + responseData: data, + hasResults: data.results && data.results.length > 0, + title: data.results && data.results[0] ? 
data.results[0].title : null + } + }; + + } catch (error) { + return { + success: false, + error: { + code: extractErrorCode(error.message), + message: error.message + }, + service: 'tavily', + url, + responseTime: Date.now() - startTime, + content: '', + metadata: { + service: SERVICES.tavily, + errorType: error.name + } + }; + } +} + +/** + * Extracts content using Jina.ai Public Endpoint + */ +async function extractWithJinaPublic(url, options = {}, timeoutMs = 10000) { + const startTime = Date.now(); + + try { + const jinaUrl = `${JINA_READER_PUBLIC_URL}${url}`; + + const response = await fetch(jinaUrl, { + method: 'GET', + headers: { + 'User-Agent': 'Mozilla/5.0 (compatible; Search-Plus-Content-Extractor/1.0)', + ...options.headers + }, + signal: AbortSignal.timeout(timeoutMs) + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Jina.ai Public error: ${response.status} - ${errorText}`); + } + + const content = await response.text(); + + return { + success: true, + content, + contentLength: content.length, + service: 'jinaPublic', + url, + responseTime: Date.now() - startTime, + metadata: { + service: SERVICES.jinaPublic, + responseStatus: response.status, + contentType: response.headers.get('content-type') + } + }; + + } catch (error) { + return { + success: false, + error: { + code: extractErrorCode(error.message), + message: error.message + }, + service: 'jinaPublic', + url, + responseTime: Date.now() - startTime, + content: '', + metadata: { + service: SERVICES.jinaPublic, + errorType: error.name + } + }; + } +} + +/** + * Extracts content using Jina.ai API (provides enhanced metadata and reliability) + */ +async function extractWithJinaAPI(url, options = {}, timeoutMs = 10000) { + const startTime = Date.now(); + + if (!JINA_API_KEY) { + throw new Error('Jina.ai API key not configured'); + } + + try { + const response = await fetch(JINA_READER_API_URL, { + method: 'POST', + headers: { + 'Authorization': `Bearer 
${JINA_API_KEY}`, + 'Content-Type': 'application/json', + 'Accept': 'application/json', + ...options.headers + }, + body: JSON.stringify({ + url: url, + ...options.jinaOptions + }), + signal: AbortSignal.timeout(timeoutMs) + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Jina.ai API error: ${response.status} - ${errorText}`); + } + + const data = await response.json(); + const content = data.data?.content || data.content || data.data || JSON.stringify(data); + + return { + success: true, + content, + contentLength: content.length, + service: 'jinaAPI', + url, + responseTime: Date.now() - startTime, + metadata: { + service: SERVICES.jinaAPI, + responseData: data, + tokenUsage: data.meta?.usage?.tokens || data.usage?.tokens, + title: data.data?.title + } + }; + + } catch (error) { + return { + success: false, + error: { + code: extractErrorCode(error.message), + message: error.message + }, + service: 'jinaAPI', + url, + responseTime: Date.now() - startTime, + content: '', + metadata: { + service: SERVICES.jinaAPI, + errorType: error.name + } + }; + } +} + +/** + * Scalable ultra-resilient fallback using pattern-based services + */ +async function tryUltraResilientFallbacks(url, originalOptions, results) { + log(`🚨 All standard services failed, trying ultra-resilient fallbacks...`); + + // Try 1: Enhanced Tavily with different user agents + if (!originalOptions.triedEnhancedParams && (!results.find(r => r.error?.message?.includes('Unauthorized')))) { + log(`🔧 Trying enhanced Tavily with different user agents...`); + + for (const userAgent of FALLBACK_SERVICES.userAgents.slice(0, 2)) { // Try top 2 user agents + try { + const enhancedResult = await extractWithTavily(url, { + ...originalOptions, + triedEnhancedParams: true, + ...userAgent + }); + + results.push(enhancedResult); + if (enhancedResult.success && enhancedResult.contentLength > 0) { + log(`✅ Enhanced Tavily (${userAgent.name}) extraction successful!`); + return { 
success: true, result: enhancedResult }; + } + } catch (error) { + log(`❌ Enhanced Tavily (${userAgent.name}) failed: ${error.message}`); + } + } + } + + // Try 2: Enhanced cache services (with async pattern support and prioritization) + if (!originalOptions.triedCacheServices) { + log(`🕐️ Trying enhanced cache services...`); + + // Get max archive attempts from configuration (default to all if not specified) + const maxAttempts = originalOptions.maxArchiveAttempts || FALLBACK_SERVICES.cacheServices.length; + + // Sort by priority and limit attempts + const sortedCacheServices = [...FALLBACK_SERVICES.cacheServices] + .sort((a, b) => a.priority - b.priority) + .slice(0, maxAttempts); + + log(` Will try up to ${maxAttempts} cache services out of ${FALLBACK_SERVICES.cacheServices.length} available`); + + for (const cacheService of sortedCacheServices) { + try { + let cacheURL; + + // Handle async pattern functions (like Internet Archive API) + if (typeof cacheService.pattern === 'function' && cacheService.constructor.name === 'AsyncFunction') { + cacheURL = await cacheService.pattern(url); + if (!cacheURL) { + log(`⚠️ ${cacheService.name}: No cached version available`); + continue; + } + } else { + cacheURL = cacheService.pattern(url); + } + + log(`🔍 Trying ${cacheService.name}: ${cacheURL.substring(0, 100)}...`); + + const cacheResult = await extractWithJinaPublic(cacheURL, { + ...originalOptions, + triedCacheServices: true, + timeout: cacheService.timeout + }); + + // Override service name to correctly identify which cache service was used + if (cacheResult.success) { + cacheResult.service = cacheService.name; + cacheResult.metadata.service = cacheService.name; + } + + results.push(cacheResult); + if (cacheResult.success && cacheResult.contentLength > 100) { + log(`✅ ${cacheService.name} extraction successful!`); + return { success: true, result: cacheResult }; + } + } catch (error) { + log(`❌ ${cacheService.name} failed: ${error.message}`); + } + } + } + + // Try 
3: Alternative Jina formats (pattern-based) + if (!originalOptions.triedAltJina) { + log(`🔄 Trying alternative Jina AI formats...`); + + for (const jinaFormat of FALLBACK_SERVICES.jinaFormats) { + try { + const altURL = jinaFormat.pattern(url); + const altResult = await extractWithJinaPublic(altURL, { + ...originalOptions, + triedAltJina: true, + timeout: jinaFormat.timeout + }); + + results.push(altResult); + if (altResult.success && altResult.contentLength > 50) { + log(`✅ Jina AI (${jinaFormat.name}) extraction successful!`); + return { success: true, result: altResult }; + } + } catch (error) { + log(`❌ Jina AI (${jinaFormat.name}) failed: ${error.message}`); + } + } + } + + // Try 4: Connection/SSL workarounds with remaining user agents + const lastResult = results[results.length - 1]; + if (!originalOptions.triedSSLWorkaround && + (lastResult?.error?.message?.includes('certificate') || lastResult?.error?.message?.includes('SSL') || + lastResult?.error?.message?.includes('ECONNREFUSED') || lastResult?.error?.message?.includes('timeout'))) { + log(`🔐 Trying connection/SSL workarounds with remaining user agents...`); + + for (const userAgent of FALLBACK_SERVICES.userAgents.slice(2)) { // Skip first 2 as they were tried above + try { + const workaroundResult = await extractWithJinaPublic(url, { + ...originalOptions, + triedSSLWorkaround: true, + ...userAgent + }); + + results.push(workaroundResult); + if (workaroundResult.success && workaroundResult.contentLength > 0) { + log(`✅ SSL/Connection workaround (${userAgent.name}) extraction successful!`); + return { success: true, result: workaroundResult }; + } + } catch (error) { + log(`❌ SSL/Connection workaround (${userAgent.name}) failed: ${error.message}`); + } + } + } + + log(`🏁 Ultra-resilient fallback attempts completed (${results.length - 3} additional attempts)`); + return { success: false, result: lastResult }; +} + +/** + * Extracts error code from error message for classification + */ +function 
extractErrorCode(errorMessage) { + if (errorMessage.includes('403')) return '403'; + if (errorMessage.includes('429')) return '429'; + if (errorMessage.includes('451')) return '451'; + if (errorMessage.includes('400')) return '400'; + if (errorMessage.includes('404')) return '404'; + if (errorMessage.includes('timeout')) return 'TIMEOUT'; + if (errorMessage.includes('ECONNREFUSED')) return 'ECONNREFUSED'; + if (errorMessage.includes('incorrect header check')) return 'HEADER_CHECK'; + if (errorMessage.includes('SecurityCompromiseError')) return 'SECURITY_COMPROMISE'; + if (errorMessage.includes('Forbidden')) return 'FORBIDDEN'; + return 'UNKNOWN'; +} + +/** + * Smart 404 Configuration System + * Provides intelligent 404 handling with user-configurable modes + */ + +// Mode presets for different 404 handling strategies +const MODE_PRESETS = { + disabled: { + enabled: false, + archiveProbability: 0.0, + maxArchiveAttempts: 0, + description: 'Skip all archive attempts for 404 errors (fastest)' + }, + conservative: { + enabled: true, + archiveProbability: 0.3, + maxArchiveAttempts: 1, + description: 'Try archives for 30% of 404s, high-value domains only' + }, + normal: { + enabled: true, + archiveProbability: 0.7, + maxArchiveAttempts: 2, + description: 'Balanced approach for most use cases' + }, + aggressive: { + enabled: true, + archiveProbability: 1.0, + maxArchiveAttempts: 3, + description: 'Try all archives for every 404 (maximum recovery)' + } +}; + +/** + * Creates 404 configuration from user options + */ +function create404Config(options = {}) { + // Check environment variable first, then options, then default to normal mode + let mode = process.env.SEARCH_PLUS_404_MODE || options.mode || 'normal'; + + // Log if environment variable is being used + if (process.env.SEARCH_PLUS_404_MODE) { + log(`🌍 404 mode from environment variable: ${process.env.SEARCH_PLUS_404_MODE}`); + } + + // Validate mode + if (!MODE_PRESETS[mode]) { + log(`⚠️ Invalid 404 mode "${mode}", 
falling back to "normal"`); + mode = 'normal'; + } + + // Start with preset configuration + let config = { ...MODE_PRESETS[mode] }; + + // Override with specific options (power user customization) + if (options.archiveProbability !== undefined) { + config.archiveProbability = Math.max(0.0, Math.min(1.0, options.archiveProbability)); + } + + if (options.maxArchiveAttempts !== undefined) { + config.maxArchiveAttempts = Math.max(0, Math.min(5, options.maxArchiveAttempts)); + } + + if (options.enabled !== undefined) { + config.enabled = options.enabled; + } + + // Add domain classifications + config.highValueDomains = options.highValueDomains || [ + 'docs.', 'documentation.', 'help.', 'support.', + 'news.', 'blog.', 'article.', 'research.', + 'wikipedia.', 'github.', 'stackoverflow.', + 'medium.', 'dev.to', 'hashnode.' + ]; + + config.lowValuePatterns = options.lowValuePatterns || [ + 'api.', 'analytics.', 'ads.', 'tracking.', + 'cdn.', 'static.', 'assets.', 'temp-', + 'cache-', 'session-', 'token-' + ]; + + config.customRules = options.customRules || {}; + + return config; +} + +/** + * Detects 404 status from URL patterns (when content extraction fails) + */ +function detect404FromURL(url) { + if (!url || typeof url !== 'string') return { + detected: false, + patterns: [], + source: 'url' + }; + + const urlLower = url.toLowerCase(); + + // URL patterns that strongly indicate 404 status + const urlPatterns = [ + '/status/404', + '/error/404', + '/404.html', + '/not-found', + '/page-not-found' + ]; + + const detectedPatterns = urlPatterns.filter(pattern => urlLower.includes(pattern)); + + return { + detected: detectedPatterns.length > 0, + patterns: detectedPatterns, + source: 'url', + confidence: detectedPatterns.length > 0 ? 
0.8 : 0.0 + }; +} + +/** + * Detects if content contains 404 error patterns + * Now used for intelligent decision-making instead of blocking + */ +function detect404Error(content) { + if (!content || typeof content !== 'string') return { + detected: false, + patterns: [] + }; + + const contentLower = content.toLowerCase(); + + // 404 indicator patterns + const patterns404 = [ + '404: not found', + 'error 404: not found', + 'this page can\'t be found', + 'page not found', + 'lost in space', + 'the page you\'re seeking might no longer exist', + 'target url returned error 404', + 'http 404', + 'status: 404', + 'this httpbin.org page can\'t be found' + ]; + + const detectedPatterns = []; + + // Check for 404 patterns + for (const pattern of patterns404) { + if (contentLower.includes(pattern)) { + detectedPatterns.push(pattern); + } + } + + return { + detected: detectedPatterns.length > 0, + patterns: detectedPatterns, + confidence: Math.min(detectedPatterns.length / 3, 1.0) + }; +} + +/** + * Determines if a URL should get archive recovery attempts + */ +function shouldTryArchives(url, detectionResult, config) { + // Quick disable checks + if (!config.enabled) return false; + if (!detectionResult.detected) return true; // Not a 404, always try + + // Probability check + if (Math.random() > config.archiveProbability) return false; + + // High-value domain check (always try for these) + if (isHighValueDomain(url, config)) return true; + + // Low-value pattern check (skip these unless aggressive mode) + if (isLowValueContent(url, config) && config.archiveProbability < 1.0) return false; + + // Custom rules check + for (const [domain, rule] of Object.entries(config.customRules)) { + if (url.includes(domain)) { + return rule === 'always' || (rule === 'try' && Math.random() < 0.5); + } + } + + return true; +} + +/** + * Checks if URL is from a high-value domain that deserves archive recovery + */ +function isHighValueDomain(url, config) { + const urlLower = 
/**
 * Classifies extracted content as meaningful or as a service error / filler page.
 *
 * Checks run in this order and the first failure is reported:
 * empty content, known error marker, trimmed length under 100 characters,
 * under 50 characters of text once tags are stripped, and finally a
 * repetitiveness check on words longer than three characters.
 *
 * @param {string} content - Extracted content to inspect.
 * @param {string} [source='unknown'] - Label echoed back in the result.
 * @returns {Object} `{ isMeaningful, reason, source }` plus reason-specific fields.
 */
function validateMeaningfulContent(content, source = 'unknown') {
  // Builds a rejection while keeping `source` as the last key.
  const reject = (reason, details = {}) => ({
    isMeaningful: false,
    reason,
    ...details,
    source
  });

  if (typeof content !== 'string' || content.trim().length === 0) {
    return reject('empty_content');
  }

  // Lower-case substrings that mark error pages, "no results" pages and
  // cache-service boilerplate. Order matters: the first match is reported.
  // NOTE(review): 'url source:' and 'markdown content:' look like Jina Reader
  // envelope headers; if raw reader output reaches this function, every such
  // result would be rejected here - confirm against the extractors.
  const errorMarkers = [
    // Google Cache/Search error patterns
    'did not match any documents',
    'no cached version available',
    'accessibility links',
    'google apps',
    'your search -',
    'suggestions:',
    'make sure all words are spelled correctly',
    'footer links',

    // Jina.ai error patterns
    'jina ai reader',
    'failed to extract content',
    'extraction failed',
    'unable to access',
    'error 404',
    'error 403',
    'error 429',
    'error 451',
    'timeouterror',
    'navigation timeout',

    // Generic error patterns
    'page not found',
    'access denied',
    'forbidden',
    'rate limit',
    'service unavailable',
    'connection refused',

    // Cache service error patterns
    'wayback machine',
    'archive.org',
    'this page is not available',
    'cached page',
    'webcache.googleusercontent.com',

    // Minimal content patterns
    'title: cache:',
    'url source:',
    'markdown content:'
  ];

  const haystack = content.toLowerCase();
  const offendingMarker = errorMarkers.find((marker) => haystack.includes(marker));
  if (offendingMarker !== undefined) {
    return reject('useless_pattern_detected', { pattern: offendingMarker });
  }

  // Very short payloads are almost always error pages.
  const trimmedLength = content.trim().length;
  if (trimmedLength < 100) {
    return reject('content_too_short', { length: trimmedLength });
  }

  // Markup-heavy payloads: measure what is left once tags are removed.
  const visibleText = content.replace(/<[^>]*>/g, '').replace(/\s+/g, ' ').trim();
  if (visibleText.length < 50) {
    return reject('insufficient_text_content', { textLength: visibleText.length });
  }

  // Highly repetitive text points at an error page or a broken extraction.
  const longWords = visibleText.split(' ').filter((word) => word.length > 3);
  const distinctWordCount = new Set(longWords).size;
  if (longWords.length > 10 && distinctWordCount / longWords.length < 0.3) {
    return reject('repetitive_content', {
      uniqueWordsRatio: distinctWordCount / longWords.length
    });
  }

  return {
    isMeaningful: true,
    reason: 'meaningful_content_detected',
    contentLength: trimmedLength,
    textLength: visibleText.length,
    source
  };
}
/**
 * Checks if an IP address is in a private or reserved range.
 *
 * Used as an SSRF guard, so it covers both address families:
 *   IPv4: 0.0.0.0/8, 10.0.0.0/8, 100.64.0.0/10, 127.0.0.0/8, 169.254.0.0/16,
 *         172.16.0.0/12, 192.168.0.0/16, 198.18.0.0/15, 224.0.0.0/3
 *   IPv6: :: (unspecified), ::1 (loopback), fc00::/7 (unique local),
 *         fe80::/10 (link-local), and IPv4-mapped addresses (::ffff:a.b.c.d)
 *         whose embedded IPv4 address is private or reserved.
 *
 * @param {string} ip - The IP address to check (brackets and zone ids tolerated).
 * @returns {boolean} - True if the IP is private/reserved, false otherwise
 *   (including when the input is not an IP address at all).
 */
function isPrivateIP(ip) {
  if (typeof ip !== 'string') return false;

  // True when the four octets fall in a private or reserved IPv4 range.
  const isReservedIPv4 = ([a, b]) => {
    if (a === 0) return true; // 0.0.0.0/8 - "this network"
    if (a === 10) return true; // 10.0.0.0/8 - Private
    if (a === 100 && b >= 64 && b <= 127) return true; // 100.64.0.0/10 - Carrier-grade NAT
    if (a === 127) return true; // 127.0.0.0/8 - Loopback
    if (a === 169 && b === 254) return true; // 169.254.0.0/16 - Link-local (cloud metadata)
    if (a === 172 && b >= 16 && b <= 31) return true; // 172.16.0.0/12 - Private
    if (a === 192 && b === 168) return true; // 192.168.0.0/16 - Private
    if (a === 198 && (b === 18 || b === 19)) return true; // 198.18.0.0/15 - Benchmarking
    if (a >= 224) return true; // Multicast, reserved and broadcast
    return false;
  };

  // Expands a textual IPv6 address into its eight 16-bit groups.
  const expandIPv6 = (address) => {
    let text = address;
    const lastColon = text.lastIndexOf(':');
    const tail = text.slice(lastColon + 1);
    if (tail.includes('.')) {
      // Trailing dotted quad (e.g. ::ffff:127.0.0.1) -> two hex groups.
      const [a, b, c, d] = tail.split('.').map((part) => Number.parseInt(part, 10));
      const high = ((a << 8) | b).toString(16);
      const low = ((c << 8) | d).toString(16);
      text = `${text.slice(0, lastColon + 1)}${high}:${low}`;
    }
    const [head, rest] = text.split('::');
    const headGroups = head ? head.split(':') : [];
    if (rest === undefined) {
      return headGroups.map((group) => Number.parseInt(group, 16));
    }
    const restGroups = rest ? rest.split(':') : [];
    const zeros = new Array(8 - headGroups.length - restGroups.length).fill('0');
    return [...headGroups, ...zeros, ...restGroups].map((group) => Number.parseInt(group, 16));
  };

  if (net.isIPv4(ip)) {
    return isReservedIPv4(ip.split('.').map((part) => Number.parseInt(part, 10)));
  }

  // Accept URL-style "[::1]" and drop any zone id ("fe80::1%eth0").
  const bare = ip.replace(/^\[|\]$/g, '').replace(/%.*$/, '');
  if (!net.isIPv6(bare)) return false;

  const groups = expandIPv6(bare);
  const leadingZeros = groups.slice(0, 5).every((group) => group === 0);

  // :: (unspecified) and ::1 (loopback)
  if (leadingZeros && groups[5] === 0 && groups[6] === 0 && groups[7] <= 1) return true;

  // IPv4-mapped (::ffff:a.b.c.d): judge by the embedded IPv4 address
  if (leadingZeros && groups[5] === 0xffff) {
    return isReservedIPv4([groups[6] >> 8, groups[6] & 0xff, groups[7] >> 8, groups[7] & 0xff]);
  }

  if ((groups[0] & 0xfe00) === 0xfc00) return true; // fc00::/7 - Unique local
  if ((groups[0] & 0xffc0) === 0xfe80) return true; // fe80::/10 - Link-local

  return false;
}
issues.push('invalid_protocol'); + return { + valid: false, + issues, + error: `SSRF attack detected: Invalid protocol '${parsedURL.protocol}'. Only HTTP and HTTPS are allowed.`, + originalURL: url, + normalizedURL + }; + } + + const { hostname } = parsedURL; + + // SSRF Protection Step 2: Hostname check + if (hostname === 'localhost' || hostname.endsWith('.local')) { + issues.push('forbidden_hostname'); + return { + valid: false, + issues, + error: `SSRF attack detected: Hostname '${hostname}' is forbidden.`, + originalURL: url, + normalizedURL + }; + } + + // SSRF Protection Step 3: Resolve hostname to IP and check + let ipAddress; + if (net.isIP(hostname)) { + ipAddress = hostname; + } else { + try { + const { address } = await dns.lookup(hostname); + ipAddress = address; + } catch (error) { + issues.push('dns_lookup_failed'); + return { + valid: false, + issues, + error: `DNS lookup failed for hostname: ${hostname}. ${error.message}`, + originalURL: url, + normalizedURL: null + }; + } + } + + if (isPrivateIP(ipAddress)) { + issues.push('private_ip_detected'); + return { + valid: false, + issues, + error: `SSRF attack detected: IP address ${ipAddress} is in a forbidden range.`, + originalURL: url, + normalizedURL + }; + } + + + // Check for obviously problematic domains that would cause API failures + const problematicPatterns = [ + /textise dot iitty/i, + /textise\.iitty/i, // The normalized version is still invalid + /example dot com/i, + /example\.com$/i, // Generic example domain + /test dot /i, + /\.com\.[a-z]/i, // Likely malformed TLD + /r\.jina\.ai\/http:\/\/[^/]*\.[a-z]{2,}\/?$/i // Jina AI with obviously fake domain + ]; + + for (const pattern of problematicPatterns) { + if (pattern.test(normalizedURL)) { + issues.push('suspicious_domain_pattern'); + break; + } + } + + // If we have suspicious patterns that can't be trusted, mark as invalid + if (issues.includes('suspicious_domain_pattern')) { + return { + valid: false, + issues, + error: `Unfixable 
/**
 * Performs comprehensive service health check.
 *
 * Probes Tavily (via API key validation), the public Jina reader and, when an
 * API key is configured, the authenticated Jina reader. The three probes are
 * independent, so they run concurrently; each HTTP probe has a 5 second
 * timeout.
 *
 * @returns {Promise<Object>} `{ tavily, jinaPublic, jinaAPI }`, each entry
 *   shaped as `{ available: boolean, error: string|null }`.
 */
async function performServiceHealthCheck() {
  // Issues one HTTP probe and folds every outcome into { available, error }.
  const probe = async (endpoint, init) => {
    try {
      const response = await fetch(endpoint, {
        ...init,
        signal: AbortSignal.timeout(5000)
      });
      return response.ok
        ? { available: true, error: null }
        : { available: false, error: `HTTP ${response.status}` };
    } catch (error) {
      return { available: false, error: error.message };
    }
  };

  // Check Tavily API
  const checkTavily = async () => {
    const tavilyValidation = await validateTavilyAPIKey();
    return { available: tavilyValidation.valid, error: tavilyValidation.reason };
  };

  // Check Jina Public
  const checkJinaPublic = () =>
    probe('https://r.jina.ai/http://example.com', { method: 'GET' });

  // Check Jina API (if key is available)
  const checkJinaAPI = async () => {
    if (!JINA_API_KEY) {
      return { available: false, error: 'API key not configured' };
    }
    return probe('https://r.jina.ai/', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${JINA_API_KEY}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ url: 'http://example.com' })
    });
  };

  const [tavilyStatus, jinaPublicStatus, jinaAPIStatus] = await Promise.all([
    checkTavily(),
    checkJinaPublic(),
    checkJinaAPI()
  ]);

  return {
    tavily: tavilyStatus,
    jinaPublic: jinaPublicStatus,
    jinaAPI: jinaAPIStatus
  };
}
'✅ Available' : '❌ Unavailable - ' + healthStatus.jinaAPI.error}`); + + // If no services are available, fail early + if (!healthStatus.tavily.available && !healthStatus.jinaPublic.available && !healthStatus.jinaAPI.available) { + return { + success: false, + error: { code: 'ALL_SERVICES_DOWN', message: 'All extraction services are unavailable' }, + content: '', + contentLength: 0, + service: 'none', + url, + responseTime: Date.now() - startTime, + totalAttempts: 0, + totalResponseTime: Date.now() - startTime, + healthStatus, + metadata: { + extractionStrategy: 'all_services_failed', + timestamp: new Date().toISOString() + } + }; + } + } + + // Initialize 404 configuration for smart handling + const config404 = create404Config(options.config404 || { mode: 'normal' }); + log(`🎯 404 Handling: ${config404.description}`); + + // Pre-validate and normalize URL before extraction + const urlValidation = await validateAndNormalizeURL(url); + let extractionURL = url; + + if (!urlValidation.valid) { + log(`❌ URL validation failed: ${urlValidation.error}`); + return { + success: false, + error: { + code: 'INVALID_URL', + message: urlValidation.error, + issues: urlValidation.issues + }, + content: '', + contentLength: 0, + service: 'validation', + url, + responseTime: Date.now() - startTime, + totalAttempts: 0, + totalResponseTime: Date.now() - startTime, + metadata: { + extractionStrategy: 'url_validation_failed', + timestamp: new Date().toISOString(), + originalURL: urlValidation.originalURL, + validationIssues: urlValidation.issues + } + }; + } + + if (urlValidation.hasFixes) { + log(`🔧 URL normalized: ${urlValidation.message}`); + log(` Original: ${urlValidation.originalURL}`); + log(` Normalized: ${urlValidation.normalizedURL}`); + extractionURL = urlValidation.normalizedURL; + } + + // Determine optimal strategy based on URL characteristics + const isDoc = isDocumentationSite(extractionURL); + const isProblematic = isProblematicDomain(extractionURL); + const 
useEnhancedMetadata = options.enhancedMetadata || options.highVolume; + + log(`🎯 Extracting content from: ${extractionURL}`); + if (extractionURL !== url) { + log(` (Original URL: ${url})`); + } + log(` URL Type: ${isDoc ? 'Documentation site' : isProblematic ? 'Problematic domain' : 'General URL'}`); + log(` Enhanced Metadata: ${useEnhancedMetadata ? 'enabled' : 'disabled'}`); + + let result; + + // Strategy 1: Always start with Tavily (research shows it's fastest and most reliable) + log(`🚀 Using Tavily first...`); + try { + result = await extractWithTavily(extractionURL, options); + results.push(result); + } catch (error) { + result = { + success: false, + error: { code: 'EXCEPTION', message: error.message }, + service: 'tavily', + url: extractionURL, + originalURL: url, + responseTime: Date.now() - startTime, + content: '', + contentLength: 0 + }; + results.push(result); + log(`❌ Tavily extraction failed with exception: ${error.message}`); + } + + // Determine fallback service based on specific needs and service availability + let fallbackService = 'jinaPublic'; // Default fallback + let fallbackReason = 'default'; + + if (useEnhancedMetadata && JINA_API_KEY) { + fallbackService = 'jinaAPI'; + fallbackReason = 'enhanced metadata requested'; + } else if (isDoc) { + fallbackService = 'jinaPublic'; + fallbackReason = 'documentation site'; + } + + // Enhanced fallback logic with better error detection + const needsFallback = !result.success || + result.error?.code === '401' || // Invalid API key + result.error?.code === '403' || // Forbidden + result.error?.code === '429' || // Rate limited + result.error?.code === 'EXCEPTION' || // Exception occurred + (result.contentLength === 0 && !options.skipEmptyFallback) || + (useEnhancedMetadata && !result.success); + + if (needsFallback) { + log(`⚠️ Tavily failed or returned empty, trying ${fallbackService} (${fallbackReason})...`); + log(` Failure reason: ${result.error?.code || result.error?.message || 'Empty 
content'}`); + + let fallbackResult; + try { + if (fallbackService === 'jinaAPI' && JINA_API_KEY) { + fallbackResult = await extractWithJinaAPI(extractionURL, options); + } else { + fallbackResult = await extractWithJinaPublic(extractionURL, options); + } + results.push(fallbackResult); + + // Use fallback if it succeeded + if (fallbackResult.success && (fallbackResult.contentLength > 0 || useEnhancedMetadata)) { + result = fallbackResult; + log(`✅ Fallback to ${fallbackService} successful`); + + // Smart 404 detection for logging and metrics + const detection404 = detect404Error(result.content); + if (detection404.detected) { + log(`🔍 404 patterns detected: ${detection404.patterns.join(', ')}`); + result.fallback404Detection = detection404; + } + } else { + log(`❌ Fallback to ${fallbackService} failed: ${fallbackResult.error?.message || 'Empty content'}`); + } + } catch (error) { + log(`❌ Fallback to ${fallbackService} failed with exception: ${error.message}`); + fallbackResult = { + success: false, + error: { code: 'EXCEPTION', message: error.message }, + service: fallbackService, + url: extractionURL, + originalURL: url, + responseTime: Date.now() - startTime, + content: '', + contentLength: 0 + }; + results.push(fallbackResult); + } + } + + // Final fallback if needed (try the remaining service) + if ((!result.success || result.contentLength === 0) && !useEnhancedMetadata && JINA_API_KEY) { + const finalService = fallbackService === 'jinaPublic' ? 'jinaAPI' : 'jinaPublic'; + log(`🔄 Final fallback to ${finalService}...`); + + try { + const finalFallback = finalService === 'jinaAPI' ? 
+ await extractWithJinaAPI(extractionURL, options) : + await extractWithJinaPublic(extractionURL, options); + + results.push(finalFallback); + + if (finalFallback.success && finalFallback.contentLength > 0) { + result = finalFallback; + log(`✅ Final fallback to ${finalService} successful`); + + // Smart 404 detection for logging and metrics + const detection404 = detect404Error(result.content); + if (detection404.detected) { + log(`🔍 404 patterns detected: ${detection404.patterns.join(', ')}`); + result.finalFallback404Detection = detection404; + } + } else { + log(`❌ Final fallback to ${finalService} failed`); + } + } catch (error) { + log(`❌ Final fallback to ${finalService} failed with exception: ${error.message}`); + results.push({ + success: false, + error: { code: 'EXCEPTION', message: error.message }, + service: finalService, + url: extractionURL, + originalURL: url, + responseTime: Date.now() - startTime, + content: '', + contentLength: 0 + }); + } + } + + // Ultra-resilient fallback: Try pattern-based alternative approaches if all standard services failed + // Use smart 404 configuration to decide whether to attempt recovery + if (!result.success || result.contentLength === 0) { + // Get the best 404 detection result we have + let detection404 = result.fallback404Detection || result.finalFallback404Detection || { detected: false }; + + // If no content-based detection worked, try URL-based detection + if (!detection404.detected) { + detection404 = detect404FromURL(extractionURL); + } + + // Determine if we should try archive recovery + const shouldTry = shouldTryArchives(extractionURL, detection404, config404); + + if (shouldTry) { + log(`🚨 Trying ultra-resilient fallbacks with 404 configuration...`); + log(` 404 detected: ${detection404.detected} (source: ${detection404.source || 'content'}), Archive probability: ${config404.archiveProbability}`); + + // Pass 404 config to the ultra-resilient fallback system + const ultraResilientOptions = { + ...options, 
+ config404, + maxArchiveAttempts: config404.maxArchiveAttempts + }; + + const ultraResilientResult = await tryUltraResilientFallbacks(extractionURL, ultraResilientOptions, results); + if (ultraResilientResult.success) { + result = ultraResilientResult.result; + results.push(ultraResilientResult.result); + log(`✅ Ultra-resilient fallback successful via ${ultraResilientResult.result.service}`); + } else { + log(`❌ Ultra-resilient fallbacks also failed`); + } + } else { + if (detection404.detected) { + log(`⏭️ Skipping ultra-resilient fallbacks (404 detected, configuration: ${config404.mode})`); + } else { + log(`⏭️ Skipping ultra-resilient fallbacks (disabled by configuration)`); + } + } + } + + const totalTime = Date.now() - startTime; + + // Return the successful result or the last attempted result + // But only consider it successful if at least one service actually worked + const hasAnySuccessfulService = results.some(r => r.success && (r.contentLength > 0 || useEnhancedMetadata)); + const successfulResult = hasAnySuccessfulService ? + results.find(r => r.success && (r.contentLength > 0 || useEnhancedMetadata)) : + result; + + // Validate if the content is actually meaningful + const contentValidation = validateMeaningfulContent(successfulResult.content, successfulResult.service); + + // Detect 404 patterns for metrics and intelligent handling + const detection404 = detect404Error(successfulResult.content); + + // Determine honest success metrics + const technicalSuccess = successfulResult.success && successfulResult.contentLength > 0; + const meaningfulSuccess = technicalSuccess && contentValidation.isMeaningful; + const fallbackLevel = determineFallbackLevel(successfulResult.service, results.length); + + // Log content validation results for debugging + if (technicalSuccess && !meaningfulSuccess) { + log(`⚠️ Technical success but content validation failed:`); + log(` Reason: ${contentValidation.reason}${contentValidation.pattern ? 
` (${contentValidation.pattern})` : ''}`); + log(` Source: ${contentValidation.source}`); + } else if (meaningfulSuccess) { + log(`✅ Meaningful content extracted successfully (${contentValidation.contentLength} chars)`); + } + + const finalResult = { + ...successfulResult, + // Legacy success field (for backwards compatibility) + success: hasAnySuccessfulService, + + // Enhanced success reporting + technicalSuccess, + meaningfulSuccess, + contentValidation, + fallbackLevel, + + totalAttempts: results.length, + totalResponseTime: totalTime, + strategy: { + isDocumentationSite: isDoc, + isProblematicDomain: isProblematic, + enhancedMetadataEnabled: useEnhancedMetadata, + primaryService: 'tavily', // ALWAYS Tavily first + fallbackService, + fallbackReason + }, + allResults: results, + metadata: { + ...successfulResult.metadata, + extractionStrategy: 'tavily_first_optimal_fallback', + timestamp: new Date().toISOString(), + totalTokensUsed: results.reduce((sum, r) => sum + (r.metadata?.tokenUsage || 0), 0), + urlValidation: { + originalURL: url, + normalizedURL: extractionURL, + wasNormalized: urlValidation.hasFixes, + validationIssues: urlValidation.issues, + validationMessage: urlValidation.message + }, + allServicesFailed: !hasAnySuccessfulService, + ultraResilientAttempts: results.length > 3 ? results.length - 3 : 0, + attemptedServices: results.map(r => r.service), + successfulService: hasAnySuccessfulService ? results.find(r => r.success && (r.contentLength > 0 || useEnhancedMetadata))?.service : null, + // New meaningful content metrics + honestSuccessMetrics: { + technicalSuccess, + meaningfulSuccess, + fallbackLevel, + contentQuality: contentValidation.isMeaningful ? 'meaningful' : 'useless', + contentIssues: contentValidation.isMeaningful ? 
null : { + reason: contentValidation.reason, + pattern: contentValidation.pattern, + source: contentValidation.source + }, + // 404 handling metrics + handling404: { + detected404: detection404?.detected || false, + fourOFourPatterns: (detection404 && detection404.patterns) ? detection404.patterns : [], + fourOFourConfidence: detection404?.confidence || 0, + attemptedArchives: shouldTryArchives(extractionURL, detection404 || { detected: false }, config404), + archiveMode: config404.mode, + archiveProbability: config404.archiveProbability, + maxArchiveAttempts: config404.maxArchiveAttempts, + isHighValueDomain: isHighValueDomain(extractionURL, config404), + isLowValueContent: isLowValueContent(extractionURL, config404) + } + } + } + }; + + // If all services failed, add appropriate error information + if (!hasAnySuccessfulService) { + finalResult.error = { + code: 'ALL_SERVICES_FAILED', + message: 'All extraction services failed to retrieve content', + attempts: results.length, + serviceResults: results.map(r => ({ service: r.service, success: r.success, error: r.error?.code })), + ultraResilientAttempts: results.length > 3 ? 
/**
 * Tavily API client.
 *
 * `search` performs a search using the Tavily API with enhanced error handling.
 *
 * @param {Object} params - Search parameters: `query`, optional `maxResults`
 *   (default 5), `includeAnswer` (default true), `includeRawContent`
 *   (default false), `numDays` (default 30) and extra request `headers`.
 * @param {number} timeoutMs - Request timeout in milliseconds (default 15000).
 * @returns {Object} Parsed JSON search results.
 * @throws {Error} When the API key is missing, the request times out, the
 *   connection is refused, or the API answers with a non-2xx status.
 */
export const tavily = {
  search: async function tavilySearch(params, timeoutMs = 15000) {
    if (!TAVILY_API_KEY) {
      throw new Error('Tavily API key not configured');
    }

    // Construct the request payload
    const requestBody = {
      api_key: TAVILY_API_KEY,
      query: params.query,
      max_results: params.maxResults || 5,
      include_answer: params.includeAnswer !== false, // Default to true
      include_raw_content: params.includeRawContent || false,
      num_days: params.numDays || 30, // Look back 30 days by default
    };

    // Add headers if provided
    const headers = {
      'Content-Type': 'application/json',
      ...params.headers
    };

    try {
      // AbortSignal.timeout owns the timer, so there is nothing to clear
      // afterwards and no pending timer is left behind on success.
      const response = await fetch('https://api.tavily.com/search', {
        method: 'POST',
        headers,
        body: JSON.stringify(requestBody),
        signal: AbortSignal.timeout(timeoutMs)
      });

      if (!response.ok) {
        const errorData = await response.json().catch(() => ({}));
        throw new Error(`Tavily API error: ${response.status} - ${errorData.error || response.statusText}`);
      }

      return await response.json();

    } catch (error) {
      // A timeout signal rejects with TimeoutError; AbortError is kept for
      // aborts raised by other means.
      if (error.name === 'AbortError' || error.name === 'TimeoutError') {
        throw new Error(`Request timeout after ${timeoutMs}ms`, { cause: error });
      }
      // fetch reports socket-level failures on error.cause rather than on
      // the error itself, so look in both places.
      const errorCode = error.code ?? error.cause?.code;
      if (errorCode === 'ECONNREFUSED') {
        throw new Error(`Connection refused when trying to reach Tavily API: ${error.message}`, { cause: error });
      }
      throw error;
    }
  }
};
/**
 * Batch content extraction for multiple URLs.
 *
 * URLs are processed in consecutive groups of `options.concurrency`
 * (default 3); each group runs in parallel and groups are separated by a
 * one second pause to be respectful to rate limits. A rejected extraction is
 * recorded as a failed entry instead of aborting the batch.
 *
 * @param {string[]} urls - URLs to extract.
 * @param {Object} [options] - Passed through to extractContent; `concurrency`
 *   and `enhancedMetadata` are also read here.
 * @returns {Promise<{results: Object[], summary: {total: number,
 *   successful: number, failed: number, successRate: number}}>}
 */
export async function extractContentBatch(urls, options = {}) {
  const results = [];

  // Only a positive whole number is a usable group size: a negative step
  // would never terminate and a fractional one would make groups overlap.
  const requestedConcurrency = Math.floor(Number(options.concurrency));
  const concurrency = requestedConcurrency >= 1 ? requestedConcurrency : 3;

  // Use the global callback-style timer explicitly so that a module-level
  // import of the promise-based setTimeout cannot shadow it.
  const pause = (ms) => new Promise((resolve) => globalThis.setTimeout(resolve, ms));

  log(`📦 Batch extracting ${urls.length} URLs with concurrency ${concurrency}`);

  for (let i = 0; i < urls.length; i += concurrency) {
    const batch = urls.slice(i, i + concurrency);
    const batchResults = await Promise.allSettled(
      batch.map((url) => extractContent(url, options))
    );

    batchResults.forEach((result, index) => {
      const url = batch[index];
      if (result.status === 'fulfilled') {
        results.push({ url, ...result.value });
      } else {
        results.push({
          url,
          success: false,
          error: {
            code: 'BATCH_ERROR',
            // The rejection reason is not guaranteed to be an Error.
            message: result.reason?.message ?? String(result.reason)
          },
          content: '',
          contentLength: 0,
          service: 'batch_failed'
        });
      }
    });

    // Small delay between batches to be respectful to rate limits
    if (i + concurrency < urls.length) {
      await pause(1000);
    }
  }

  const successCount = results.filter(
    (r) => r.success && (r.contentLength > 0 || options.enhancedMetadata)
  ).length;
  log(`✅ Batch extraction complete: ${successCount}/${urls.length} successful`);

  return {
    results,
    summary: {
      total: urls.length,
      successful: successCount,
      failed: urls.length - successCount,
      // Avoid 0/0 = NaN for an empty batch.
      successRate: urls.length === 0 ? 0 : Math.round((successCount / urls.length) * 100)
    }
  };
}
'./content-extractor.mjs'; + +/** + * Handles rate limiting scenarios + * @param {Object} error - The rate limit error + * @param {Object} options - Search options + * @returns {Object} Results after handling rate limit + */ +export async function handleRateLimit(error, options) { + console.log('Handling rate limit error...'); + + // Extract retry-after header if available, or use default delay + let delay = 60000; // Default to 1 minute + + if (error.response && error.response.headers && error.response.headers['retry-after']) { + const retryAfter = parseInt(error.response.headers['retry-after'], 10); + if (!isNaN(retryAfter)) { + delay = retryAfter * 1000; // Convert to milliseconds + } + } + + // Apply jitter to avoid thundering herd problem + const jitter = Math.random() * 5000; // Up to 5 seconds + delay += jitter; + + console.log(`Waiting ${Math.round(delay/1000)} seconds before retrying due to rate limiting...`); + + try { + // Wait for the required time + await new Promise(resolve => setTimeout(resolve, delay)); + + // Try again with modified parameters to reduce load + const modifiedParams = { + ...options, + headers: generateRateLimitHeaders(), + maxResults: Math.max(1, Math.floor((options.maxResults || 5) / 2)) // Reduce number of results + }; + + const results = await contentExtractor.tavily.search(modifiedParams); + return { + success: true, + data: results, + message: 'Successfully retrieved results after handling rate limit' + }; + + } catch (retryError) { + // If still rate limited, try with even more conservative parameters + try { + // Wait an additional time + await new Promise(resolve => setTimeout(resolve, 120000)); // 2 minutes + + const conservativeParams = { + ...options, + headers: generateVeryConservativeHeaders(), + maxResults: 1, // Get just one result + query: simplifyQuery(options.query) + }; + + const results = await contentExtractor.tavily.search(conservativeParams); + return { + success: true, + data: results, + message: 
/**
 * Simplifies a query to reduce complexity.
 *
 * Strips instructional phrases ("how to", "guide to", "tutorial for") and
 * intensity adjectives ("detailed", "comprehensive", "complete"), then
 * collapses the whitespace left behind. If stripping would leave nothing,
 * the trimmed original is returned so that an empty query is never produced
 * from a non-empty one.
 *
 * @param {string} query - Original query
 * @returns {string} Simplified query ('' for null/undefined input)
 */
function simplifyQuery(query) {
  const original = String(query ?? '').trim();

  // Remove complex terms that might trigger more intensive processing
  const simplified = original
    .replace(/\b(how to|guide to|tutorial for)\b/gi, '')
    .replace(/\b(detailed|comprehensive|complete)\b/gi, '')
    .replace(/\s+/g, ' ')
    .trim();

  return simplified || original;
}
============================================================================ +// CONFIGURATION: Recovery strategy timeout +// ============================================================================ + +/** + * Recovery strategy timeout in milliseconds + * Environment variable: SEARCH_PLUS_RECOVERY_TIMEOUT_MS + * Default: 5000ms (5 seconds) - based on project requirements for <5s average recovery + */ +const RECOVERY_TIMEOUT_MS = validateRecoveryTimeout(process.env.SEARCH_PLUS_RECOVERY_TIMEOUT_MS || '5000'); + +/** + * Validates recovery timeout configuration value + * @param {string} value - The timeout value to validate + * @returns {number} Validated timeout in milliseconds + */ +function validateRecoveryTimeout(value) { + const parsed = parseInt(value, 10); + + // Check if value is a valid number + if (isNaN(parsed)) { + console.warn(`⚠️ Invalid SEARCH_PLUS_RECOVERY_TIMEOUT_MS: "${value}". Using default 5000ms.`); + return 5000; + } + + // Check for reasonable bounds (100ms to 60s) + if (parsed < 100) { + console.warn(`⚠️ SEARCH_PLUS_RECOVERY_TIMEOUT_MS too low: ${parsed}ms. Minimum is 100ms. Using 100ms.`); + return 100; + } + + if (parsed > 60000) { + console.warn(`⚠️ SEARCH_PLUS_RECOVERY_TIMEOUT_MS too high: ${parsed}ms. Maximum is 60000ms. 
/**
 * Handles web search errors with advanced recovery strategies.
 *
 * Classifies the failure by status code or message text and delegates to the
 * matching recovery handler. Checks run in a fixed order (403, 451, 422, 429,
 * connection refused, timeout) and the first match wins.
 *
 * The input is normalised before classification so that the error handler
 * cannot itself throw: `error` may be an Error, a plain object without a
 * `message`, a bare string, or null/undefined. Status codes are matched
 * whether they arrive as numbers (403) or strings ('403').
 *
 * @param {Object|string|null} error - The error that aborted the search
 * @param {Object} options - Search options that caused the error
 * @returns {Promise<Object>} Recovery results or final error
 */
export async function handleWebSearchError(error, options) {
  console.log('Handling search error:', error);

  let message = '';
  if (typeof error === 'string') {
    message = error;
  } else if (typeof error?.message === 'string') {
    message = error.message;
  }
  const lowerMessage = message.toLowerCase();
  const code = error?.code;

  // Well-formed errors are forwarded untouched; anything else is wrapped so
  // the downstream handlers can rely on a string `message`.
  const normalizedError = typeof error?.message === 'string'
    ? error
    : { ...(error !== null && typeof error === 'object' ? error : {}), message, code };

  const hasStatus = (status) =>
    code === status || code === String(status) || message.includes(String(status));
  const mentions = (...phrases) => phrases.some((phrase) => lowerMessage.includes(phrase));

  // Check error type and apply appropriate recovery strategy
  if (hasStatus(403) || mentions('forbidden')) {
    return await handle403Error(normalizedError, options);
  }
  if (hasStatus(451) || mentions('securitycompromise', 'blocked until')) {
    return await handle451SecurityError(normalizedError, options);
  }
  if (hasStatus(422) || is422SchemaError(normalizedError)) {
    return await handle422Error(normalizedError, options);
  }
  if (hasStatus(429) || mentions('rate limit')) {
    return await handleRateLimit(normalizedError, options);
  }
  if (code === 'ECONNREFUSED' || mentions('connection refused')) {
    return await handleConnectionRefusedError(normalizedError, options);
  }
  if (code === 'ETIMEDOUT' || mentions('timeout')) {
    return await handleTimeoutError(normalizedError, options);
  }

  // For other errors, return the original error
  return {
    error: true,
    message: `Search failed: ${message || 'Unknown error'}`,
    code
  };
}
/**
 * Handles 451 SecurityCompromiseError (domain blocked due to abuse).
 * Runs two recovery strategies in parallel and returns the first one that
 * actually succeeds, with enhanced UX logging.
 *
 * The strategy functions report failure by *fulfilling* with a
 * `{ success: false, ... }` response rather than rejecting. `Promise.any`
 * only skips rejections, so each strategy is wrapped to turn an unsuccessful
 * response into a rejection; otherwise the first failure to settle would be
 * returned to the caller as `success: true` with `data: undefined`.
 *
 * @param {Object} error - The 451 error
 * @param {Object} options - Search options
 * @returns {Promise<Object>} Recovery results
 */
async function handle451SecurityError(error, options) {
  const errorMessage = typeof error?.message === 'string' ? error.message : '';
  const blockedDomain = extractBlockedDomain(errorMessage);

  // Simple mode for power users who want minimal output
  if (process.env.SEARCH_PLUS_451_SIMPLE_MODE === 'true') {
    return await handleSimple451Recovery(error, options, blockedDomain);
  }

  // Enhanced UX logging by default
  console.log('🚫 451 SecurityCompromiseError detected');
  console.log(`📍 Blocked domain: ${blockedDomain || 'unknown'}`);
  console.log('🚀 Starting parallel recovery:');
  console.log('   🛡️ Strategy 1: Domain exclusion');
  console.log('   🔍 Strategy 2: Alternative sources');

  // Convert a fulfilled-but-failed strategy response into a rejection.
  const requireSuccess = async (pendingStrategy) => {
    const outcome = await pendingStrategy;
    if (!outcome || outcome.success !== true) {
      throw new Error(String(outcome?.error ?? 'Recovery strategy failed'));
    }
    return outcome;
  };

  // The optimized alternative-sources strategy reads the domain from
  // `options.blockedDomain`, so it has to be supplied here.
  const alternativeOptions = blockedDomain ? { ...options, blockedDomain } : options;

  // Optimized parallel execution using the two most effective strategies
  const strategies = [
    requireSuccess(searchWithExcludedDomainUnified(options, blockedDomain, true)),
    requireSuccess(tryAlternativeSearchSources(alternativeOptions, true))
  ];

  try {
    const results = await Promise.any(strategies);
    console.log(`✅ Success! Used strategy: ${results.strategy} (${results.responseTime}ms)`);

    // Provide actionable suggestions for future searches
    if (blockedDomain) {
      console.log(`💡 Next time, try: /search-plus "${options.query} -site:${blockedDomain}"`);
    }

    return {
      success: true,
      data: results.data,
      message: `Successfully retrieved results using ${results.strategy} for blocked domain ${blockedDomain || 'unknown'}`,
      strategy: results.strategy,
      responseTime: results.responseTime,
      blockedDomain: blockedDomain
    };

  } catch (aggregateError) {
    // Enhanced error classification and user guidance
    const failureType = classify451Failure(aggregateError, blockedDomain, options);
    console.log(`❌ All recovery strategies failed`);
    console.log(`🔍 Error type: ${failureType.type}`);

    if (failureType.suggestions.length > 0) {
      console.log('💡 Suggestions:');
      failureType.suggestions.forEach((suggestion, i) => {
        console.log(`   ${i + 1}. ${suggestion.description}`);
      });
    }

    return generateEnhancedErrorResponse(failureType, blockedDomain, options);
  }
}
/**
 * Classifies 451 failure types for enhanced error handling.
 *
 * A failure is a 'permanent-block' when any individual strategy error
 * mentions "blocked until"; everything else is 'recovery-failed'.
 *
 * Tolerates malformed input: `aggregateError.errors` may be missing (a plain
 * Error was passed), and individual entries may be strings or objects without
 * a `message`. When the blocked domain is unknown the retry suggestion simply
 * re-runs the query instead of emitting a meaningless `-site:null` filter.
 *
 * @param {AggregateError} aggregateError - The combined error from failed strategies
 * @param {string|null} blockedDomain - The blocked domain, if one was identified
 * @param {Object} options - Original search options (`options.query` is used)
 * @returns {Object} Failure classification: `{ type, suggestions, autoSuggestion }`
 */
function classify451Failure(aggregateError, blockedDomain, options) {
  const query = options?.query ?? '';
  const failures = Array.isArray(aggregateError?.errors) ? aggregateError.errors : [];

  const messageOf = (failure) => {
    if (typeof failure === 'string') {
      return failure;
    }
    return typeof failure?.message === 'string' ? failure.message : '';
  };

  const retryCommand = blockedDomain
    ? `/search-plus "${query} -site:${blockedDomain}"`
    : `/search-plus "${query}"`;

  // Check for permanent block patterns
  if (failures.some((failure) => messageOf(failure).includes('blocked until'))) {
    return {
      type: 'permanent-block',
      suggestions: [
        {
          type: 'ready-to-run',
          command: retryCommand,
          description: blockedDomain ? 'Exclude blocked domain and search again' : 'Try the search again'
        },
        {
          type: 'manual-search',
          url: `https://www.google.com/search?q=${encodeURIComponent(query)}`,
          description: 'Search manually in external browser'
        }
      ],
      autoSuggestion: {
        message: 'For more predictable results, enable simple 451 handling?',
        command: 'export SEARCH_PLUS_451_SIMPLE_MODE=true',
        benefit: 'Provides clear guidance instead of complex automation'
      }
    };
  }

  // Default classification
  return {
    type: 'recovery-failed',
    suggestions: [
      {
        type: 'ready-to-run',
        command: retryCommand,
        description: blockedDomain ? 'Try again excluding the blocked domain' : 'Try the search again'
      }
    ],
    autoSuggestion: {
      message: 'Want simpler error handling?',
      command: 'export SEARCH_PLUS_451_SIMPLE_MODE=true',
      benefit: 'Minimal output with focus on results'
    }
  };
}
/**
 * Extracts the blocked domain from an error message.
 *
 * Recognises "domain <token> blocked" and "access to <token> blocked"
 * (case-insensitive). The captured token is reduced to a bare lowercase host
 * name so it can be used in a `-site:` filter: surrounding quotes/brackets
 * and trailing punctuation are stripped, as are a URL scheme, path, query,
 * fragment and port.
 *
 * @param {string} errorMessage - The error message
 * @returns {string|null} The blocked domain, or null if none was found or
 *   `errorMessage` is not a string
 */
function extractBlockedDomain(errorMessage) {
  if (typeof errorMessage !== 'string') {
    return null;
  }

  const domainMatch = errorMessage.match(/domain (\S+) blocked/i) ||
                     errorMessage.match(/access to (\S+) blocked/i);
  if (!domainMatch) {
    return null;
  }

  const host = domainMatch[1]
    .replace(/^["'`(\[]+/, '')                  // opening quotes / brackets
    .replace(/["'`)\].,;:]+$/, '')              // closing quotes / trailing punctuation
    .replace(/^[a-z][a-z0-9+.-]*:\/\//i, '')    // URL scheme
    .split(/[/?#]/)[0]                          // path, query, fragment
    .replace(/:\d+$/, '')                       // port
    .toLowerCase();

  return host || null;
}
/**
 * Domain exclusion search with configurable optimization level.
 *
 * Re-runs the query with `-site:<blockedDomain>` appended and fresh headers.
 *  - optimized: no pre-delay, 1000 ms deadline, an AbortSignal is passed to
 *    the search call.
 *  - standard: 3000 ms pre-delay, deadline of RECOVERY_TIMEOUT_MS.
 *
 * The deadline is enforced by racing the search against a timer, so it holds
 * even if the search implementation ignores the abort signal. All timers are
 * cleared in `finally`, so nothing is left pending after the function
 * settles, and in standard mode a search that has not started when the
 * deadline passes is never sent.
 *
 * This function never rejects: failures are reported as a standard error
 * response (`success: false`).
 *
 * @param {Object} options - Original search options
 * @param {string} blockedDomain - The blocked domain
 * @param {boolean} optimized - Whether to use optimized timeouts for parallel execution
 * @returns {Promise<Object>} Standard success or error response
 */
async function searchWithExcludedDomainUnified(options, blockedDomain, optimized = false) {
  const startTime = Date.now();
  const strategyName = 'excluded-domain-search';
  const timeout = optimized ? 1000 : RECOVERY_TIMEOUT_MS;

  if (!blockedDomain) {
    return createStandardErrorResponse(strategyName, 'No blocked domain to exclude', startTime);
  }

  const abortController = new AbortController();
  let timeoutId;
  let delayId;

  try {
    console.log(optimized ? `🛡️ Excluding domain: ${blockedDomain}` : `Searching while excluding domain: ${blockedDomain}`);
    const modifiedParams = {
      ...options,
      query: `${options.query} -site:${blockedDomain}`,
      headers: generateDiverseHeaders()
    };

    const deadline = new Promise((_, reject) => {
      timeoutId = setTimeout(() => {
        abortController.abort();
        reject(new Error(optimized ? 'Strategy timeout' : `Strategy timed out after ${timeout}ms`));
      }, timeout);
    });

    const search = (async () => {
      if (!optimized) {
        // Standard mode backs off before retrying. The timer id is kept so the
        // pending search can be cancelled if the deadline fires first.
        await new Promise((resolve) => {
          delayId = setTimeout(resolve, 3000);
        });
        return contentExtractor.tavily.search(modifiedParams);
      }
      return contentExtractor.tavily.search({
        ...modifiedParams,
        signal: abortController.signal
      });
    })();

    const results = await Promise.race([search, deadline]);
    return createStandardSuccessResponse(strategyName, results, startTime);
  } catch (error) {
    // A search that honours the signal rejects with AbortError on timeout.
    const reported = optimized && error?.name === 'AbortError'
      ? new Error('Strategy timeout')
      : error;
    return createStandardErrorResponse(strategyName, reported, startTime);
  } finally {
    clearTimeout(timeoutId);
    clearTimeout(delayId);
  }
}
/**
 * Reformulates query to avoid references to blocked domains.
 *
 * Every literal occurrence of the blocked domain in the query is replaced by
 * a generic description (or 'online service' for unknown domains), then the
 * search is retried with `search_depth: "basic"` after a 2500 ms pause. The
 * whole strategy is raced against RECOVERY_TIMEOUT_MS.
 *
 * The domain comes from an upstream error message, so it is escaped before
 * being compiled into a RegExp: `.` must only match a dot, and stray
 * metacharacters must not throw. The description table is a Map so names
 * such as "constructor" cannot resolve to Object.prototype members.
 *
 * Never rejects: failures and timeouts resolve to `{ success: false, ... }`.
 *
 * @param {Object} options - Original search options
 * @param {string} blockedDomain - The blocked domain
 * @returns {Promise<Object>} `{ success, data|error, strategy, responseTime }`
 */
async function reformulateQueryAvoidingBlockedDomain(options, blockedDomain) {
  const startTime = Date.now();
  const strategyName = 'reformulate-query';
  let timeoutId;

  const strategyPromise = (async () => {
    try {
      console.log('Reformulating query to avoid blocked domain references...');
      let reformulatedQuery = options.query;
      if (blockedDomain) {
        const domainMappings = new Map([
          ['httpbin.org', 'HTTP testing API endpoint service'],
          ['github.com', 'code repository platform'],
          ['stackoverflow.com', 'programming Q&A website'],
          ['medium.com', 'blogging platform']
        ]);
        const genericTerm = domainMappings.get(blockedDomain) || 'online service';
        const literalDomain = String(blockedDomain).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        // Function replacer: the replacement text is inserted verbatim.
        reformulatedQuery = options.query.replace(new RegExp(literalDomain, 'gi'), () => genericTerm);
      }
      const modifiedParams = { ...options, query: reformulatedQuery, search_depth: "basic" };

      await new Promise(resolve => setTimeout(resolve, 2500));
      const results = await contentExtractor.tavily.search(modifiedParams);

      return { success: true, data: results, strategy: strategyName, responseTime: Date.now() - startTime };
    } catch (error) {
      return { success: false, error: error.message, strategy: strategyName, responseTime: Date.now() - startTime };
    }
  })();

  const timeoutPromise = new Promise((resolve) => {
    timeoutId = setTimeout(() => resolve({
      success: false,
      error: `Strategy timed out after ${RECOVERY_TIMEOUT_MS}ms`,
      strategy: strategyName,
      responseTime: Date.now() - startTime
    }), RECOVERY_TIMEOUT_MS);
  });

  try {
    return await Promise.race([strategyPromise, timeoutPromise]);
  } finally {
    // Do not leave the deadline timer pending once the race is decided.
    clearTimeout(timeoutId);
  }
}
or cached content...'); + const archiveQuery = blockedDomain + ? `${options.query} web archive OR wayback machine OR cached version "site:${blockedDomain}"` + : `${options.query} archived OR cached OR mirror`; + const modifiedParams = { ...options, query: archiveQuery, max_results: Math.min(options.max_results || 10, 5) }; + + await new Promise(resolve => setTimeout(resolve, 4000)); + const results = await contentExtractor.tavily.search(modifiedParams); + + return { success: true, data: results, strategy: strategyName, responseTime: Date.now() - startTime }; + } catch (error) { + return { success: false, error: error.message, strategy: strategyName, responseTime: Date.now() - startTime }; + } + })(); + + const timeoutPromise = new Promise((resolve) => { + setTimeout(() => resolve({ + success: false, + error: `Strategy timed out after ${RECOVERY_TIMEOUT_MS}ms`, + strategy: strategyName, + responseTime: Date.now() - startTime + }), RECOVERY_TIMEOUT_MS); + }); + + return Promise.race([strategyPromise, timeoutPromise]); +} + +/** + * Handles connection refused errors + * @param {Object} error - The connection error + * @param {Object} options - Search options + * @returns {Object} Recovery results + */ +async function handleConnectionRefusedError(error, options) { + console.log('Handling connection refused error...'); + + try { + // Sometimes waiting and retrying works + await new Promise(resolve => setTimeout(resolve, 5000)); + + // Try with different parameters + const modifiedParams = { + ...options, + headers: generateDiverseHeaders(), + timeout: (options.timeout || 10000) + 5000 // Increase timeout + }; + + const results = await contentExtractor.tavily.search(modifiedParams); + return { + success: true, + data: results, + message: 'Successfully retrieved results after handling connection refused error' + }; + } catch (retryError) { + return { + error: true, + message: `Failed to retrieve results after handling connection refused error: ${retryError.message}` + }; 
/**
 * Handles timeout errors by retrying once with a longer timeout and
 * different headers.
 *
 * The retry timeout is double the previous one (default 10000 ms), capped at
 * 30 s. The cap is never allowed to *shorten* the timeout: if the caller had
 * already configured more than 30 s, retrying with less would only time out
 * sooner, so the previous value is kept.
 *
 * @param {Object} error - The timeout error
 * @param {Object} options - Search options
 * @returns {Promise<Object>} Recovery results: `{ success, data, message }`
 *   on success, `{ error: true, message }` when the retry also fails
 */
async function handleTimeoutError(error, options) {
  console.log('Handling timeout error...');

  try {
    const previousTimeout = options.timeout || 10000;
    const retryTimeout = Math.max(previousTimeout, Math.min(previousTimeout * 2, 30000));

    // Retry with increased timeout and different headers
    const modifiedParams = {
      ...options,
      headers: generateDiverseHeaders(),
      timeout: retryTimeout
    };

    const results = await contentExtractor.tavily.search(modifiedParams);
    return {
      success: true,
      data: results,
      message: 'Successfully retrieved results after handling timeout error'
    };
  } catch (retryError) {
    return {
      error: true,
      message: `Failed to retrieve results after handling timeout error: ${retryError.message}`
    };
  }
}
/**
 * Detects if error is a 422 schema validation error.
 *
 * Looks for well-known validation phrases in the error message and in the
 * JSON serialisation of the whole error object (so nested `detail` payloads
 * are covered). Matching is case-insensitive: both the haystack and the
 * patterns are lowercase.
 *
 * Serialisation is best-effort. Error objects frequently carry circular
 * references (request/response pairs), for which JSON.stringify throws; in
 * that case only the message is inspected.
 *
 * @param {Object} error - The error object
 * @returns {boolean} True if this is a 422 schema error
 */
function is422SchemaError(error) {
  const errorMessage = typeof error?.message === 'string' ? error.message.toLowerCase() : '';

  let errorString = '';
  try {
    // JSON.stringify returns undefined for undefined/functions.
    errorString = (JSON.stringify(error) ?? '').toLowerCase();
  } catch {
    errorString = '';
  }

  // Common 422 schema validation patterns. Keep these lowercase: they are
  // compared against lowercased text.
  const schemaErrorPatterns = [
    'missing',
    'input_schema',
    'field required',
    'unprocessable entity',
    'validation error',
    'schema validation',
    'invalid request format'
  ];

  return schemaErrorPatterns.some(pattern =>
    errorMessage.includes(pattern) ||
    errorString.includes(pattern)
  );
}
/**
 * Retries the search with a simplified query to avoid schema validation
 * issues.
 *
 * The query is passed through simplifyQueryForSchema, `max_results` is capped
 * at 5 (default 10 when unset) and `search_depth` is forced to "basic". All
 * other options are forwarded unchanged. The retry is sent after a 1500 ms
 * pause.
 *
 * @param {Object} options - Original search options
 * @returns {Promise<Object>} Search results
 */
async function simplifyQueryAndRetry(options) {
  console.log('Simplifying query for schema compatibility...');

  const requestedResults = options.max_results || 10;
  const retryParams = Object.assign({}, options, {
    query: simplifyQueryForSchema(options.query),
    max_results: Math.min(requestedResults, 5), // fewer results keeps the request simple
    search_depth: "basic"                       // simpler search mode
  });

  // Brief pause before the retry.
  await new Promise((done) => {
    setTimeout(done, 1500);
  });

  const results = await contentExtractor.tavily.search(retryParams);
  return results;
}
/**
 * Simplifies query for schema compatibility.
 *
 * Keeps letters, combining marks and digits from any script, underscores,
 * whitespace and basic punctuation (- . , ! ?); drops everything else. The
 * result is whitespace-normalised and limited to 200 UTF-16 code units.
 *
 * Whitespace is collapsed after the removal step, because removing a symbol
 * between two spaces would otherwise leave a double space. The length limit
 * never cuts a surrogate pair in half.
 *
 * @param {string} query - Original query
 * @returns {string} Simplified query ('' when `query` is not a string)
 */
function simplifyQueryForSchema(query) {
  if (typeof query !== 'string') {
    return '';
  }

  const cleaned = query
    .replace(/[^\p{L}\p{M}\p{N}_\s\-.,!?]/gu, '') // Remove special characters except basic punctuation
    .replace(/\s+/g, ' ')                          // Normalize whitespace
    .trim();

  let limited = cleaned.substring(0, 200);         // Limit length
  if (/[\uD800-\uDBFF]$/.test(limited)) {
    // The cut landed between the two halves of a surrogate pair.
    limited = limited.slice(0, -1);
  }

  return limited.trim();
}
reformulated = query; + for (const [original, replacement] of Object.entries(synonyms)) { + reformulated = reformulated.replace(new RegExp(original, 'gi'), replacement); + } + + return reformulated; +} + +// Export additional functions for testing +export { + classify451Failure, + validateRecoveryTimeout, + createStandardErrorResponse, + createStandardSuccessResponse +}; \ No newline at end of file diff --git a/hooks/handle-web-search.mjs b/hooks/handle-web-search.mjs new file mode 100644 index 0000000..fc43cc5 --- /dev/null +++ b/hooks/handle-web-search.mjs @@ -0,0 +1,458 @@ +// hooks/handle-web-search.mjs +import { tavily, extractContent } from './content-extractor.mjs'; +import { handleWebSearchError } from './handle-search-error.mjs'; + +// Configuration for environment variable namespacing +const TAVILY_API_KEY = process.env.SEARCH_PLUS_TAVILY_API_KEY || process.env.TAVILY_API_KEY || null; +const JINAAI_API_KEY = process.env.SEARCH_PLUS_JINAAI_API_KEY || process.env.JINAAI_API_KEY || null; + +// Show deprecation warnings for old variable names +if (!process.env.SEARCH_PLUS_TAVILY_API_KEY && process.env.TAVILY_API_KEY) { + console.warn('⚠️ TAVILY_API_KEY is deprecated. Please update to SEARCH_PLUS_TAVILY_API_KEY'); +} +if (!process.env.SEARCH_PLUS_JINAAI_API_KEY && process.env.JINAAI_API_KEY) { + console.warn('⚠️ JINAAI_API_KEY is deprecated. 
/**
 * Handles web search requests with enhanced error handling.
 *
 * - An http(s) URL as the query is routed to content extraction.
 * - Anything else goes through the hybrid search strategy; if that throws,
 *   the error is handed to handleWebSearchError for one round of recovery.
 *
 * The primary attempt and the recovery attempt are built from the same base
 * parameters, so an explicit `includeAnswer: false` is respected on both
 * paths.
 *
 * @param {Object} params - Search parameters
 * @param {string} [params.query] - Search query or URL (alias: `params.q`)
 * @param {number} [params.maxRetries=3] - Retry budget forwarded to URL extraction
 * @param {number} [params.timeout=10000] - Timeout in milliseconds
 * @param {number} [params.maxResults=5] - Maximum number of results
 * @param {boolean} [params.includeAnswer=true] - Set to false to omit the answer
 * @param {boolean} [params.includeRawContent=false] - Whether to include raw content
 * @returns {Promise<Object>} Search results or error information
 */
export async function handleWebSearch(params) {
  const request = params ?? {};
  const query = request.query || request.q || '';
  const maxRetries = request.maxRetries || 3;
  const timeout = request.timeout || 10000; // 10 seconds default

  if (!query) {
    return {
      error: true,
      message: 'No search query or URL provided'
    };
  }

  // Check if the query is a URL and handle extraction
  if (isURL(query)) {
    console.log(`🔍 Extracting content from URL: ${query}`);
    const result = await handleURLExtraction(query, { maxRetries, timeout });

    // Provide brief status feedback
    if (result.success) {
      console.log(`✅ URL extraction completed successfully`);
    } else {
      console.log(`❌ URL extraction failed: ${result.message}`);
    }

    return result;
  }

  // Provide status feedback for search queries (URLs have already returned)
  console.log(`🔍 Searching: ${query}`);

  // Shared by the primary attempt and the recovery attempt.
  const baseParams = {
    query,
    maxResults: request.maxResults || 5,
    includeAnswer: request.includeAnswer !== false,
    includeRawContent: request.includeRawContent || false
  };

  // Use hybrid search strategy
  try {
    const result = await performHybridSearch(
      { ...baseParams, headers: generateRandomHeaders() },
      timeout
    );

    return {
      success: true,
      data: result.data,
      service: result.service,
      attempt: 1
    };

  } catch (error) {
    console.error('All search strategies failed:', error.message);

    // Final error handling for recovery attempts
    const errorResult = await handleWebSearchError(error, {
      ...baseParams,
      headers: generateRandomHeaders(),
      timeout,
      attempt: 1,
      error: error
    });

    if (errorResult && errorResult.success) {
      return {
        success: true,
        data: errorResult.data,
        attempt: 1,
        errorRecovered: true,
        originalError: error.message,
        recoveryMessage: errorResult.message
      };
    }

    return {
      error: true,
      message: errorResult?.message || error.message,
      attempt: 1,
      errorHandlingApplied: true
    };
  }
}
Try again or configure Tavily API key for enhanced reliability.', { cause: aggregateError }); // keep the per-service failures reachable via error.cause + } +} + + +/** + * Attempts search using SearXNG metasearch engine. + * Instances are tried in order; the first one that answers with a non-empty result list wins. + * @param {Object} params - Search parameters (query, maxResults, headers are read here) + * @param {number} [timeoutMs=10000] - Per-request timeout in milliseconds + * @returns {Promise<{data: Object, service: string}>} Tavily-like result payload tagged 'searxng' + * @throws {Error} If every instance fails or returns no results + */ +async function trySearXNGSearch(params, timeoutMs = 10000) { + const searxngInstances = [ + 'https://search.brave.works', + 'https://searx.be', + 'https://searx.tiekoetter.com', + 'https://search.snopyta.org' + ]; + + const query = encodeURIComponent(params.query); + const maxResults = params.maxResults || 5; + const startTime = Date.now(); // wall-clock start, used for response_time below + + for (const instance of searxngInstances) { + try { + const searchUrl = `${instance}/search?q=${query}&format=json&engines=google,duckduckgo,startpage&results=${maxResults}`; + + const response = await fetch(searchUrl, { + method: 'GET', + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'application/json, text/plain, */*', + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive', + 'Referer': instance, + 'Sec-Fetch-Dest': 'empty', + 'Sec-Fetch-Mode': 'cors', + 'Sec-Fetch-Site': 'same-origin', + ...params.headers + }, + signal: AbortSignal.timeout(timeoutMs) + }); + + if (!response.ok) { + continue; // Try next instance + } + + const data = await response.json(); + + if (!data.results || data.results.length === 0) { + continue; // Try next instance + } + + // Transform SearXNG results to Tavily-like format + const transformedResults = { + results: data.results.slice(0, maxResults).map((item, index) => ({ + title: item.title, + url: item.url, + content: item.content || '', + score: 1.0 - (index * 0.1), // Simple scoring + published_date: item.publishedDate || null + })), + answer: data.answers?.[0] || null, + query: params.query, + // Elapsed milliseconds across all instances tried so far. The previous + // `Date.now() - performance.now()` was not a duration. + // NOTE(review): confirm the unit consumers expect (ms vs s). + response_time: Date.now() - startTime + }; + + return { data: transformedResults, service: 'searxng' }; + + } catch (error) { + console.log(`❌ SearXNG instance ${instance} failed: ${error.message}`); + continue; // Try next instance
+ } + } + + throw new Error('All SearXNG instances failed'); +} + +/** + * Attempts search using DuckDuckGo HTML parsing + * @param {Object} params - Search parameters (query, maxResults, headers are read here) + * @param {number} [timeoutMs=10000] - Request timeout in milliseconds + * @returns {Promise<{data: Object, service: string}>} Result payload tagged 'duckduckgo-html' + * @throws {Error} On a non-OK HTTP status or when no result could be parsed + */ +async function tryDuckDuckGoHTML(params, timeoutMs = 10000) { + const query = encodeURIComponent(params.query); + const maxResults = params.maxResults || 5; + const startTime = Date.now(); // wall-clock start, used for response_time below + + const searchUrl = `https://html.duckduckgo.com/html/?q=${query}&kl=us-en`; + + const response = await fetch(searchUrl, { + method: 'GET', + headers: { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Cache-Control': 'max-age=0', + ...params.headers + }, + signal: AbortSignal.timeout(timeoutMs) + }); + + if (!response.ok) { + throw new Error(`DuckDuckGo HTML error: ${response.status}`); + } + + const html = await response.text(); + + // Parse HTML results + const results = []; + const resultRegex = /
[\s\S]*?([^<]+)<\/a>[\s\S]*?([^<]*)<\/a>/g; + + let match; + while ((match = resultRegex.exec(html)) !== null && results.length < maxResults) { + const [, url, title, snippet] = match; + + if (url && title && !url.includes('//r.jina.ai/http')) { // Filter out redirect links + results.push({ + title: title.trim(), + url: url.startsWith('http') ? url : `https:${url}`, + content: snippet ? snippet.replace(/<[^>]*>/g, '').trim() : '', + score: 1.0 - (results.length * 0.1) + }); + } + } + + if (results.length === 0) { + throw new Error('No results found in DuckDuckGo HTML response'); + } + + const transformedResults = { + results, + answer: null, // DuckDuckGo doesn't provide instant answers in HTML mode + query: params.query, + // Elapsed milliseconds; the previous `Date.now() - performance.now()` was not a duration + response_time: Date.now() - startTime + }; + + return { data: transformedResults, service: 'duckduckgo-html' }; +} + +/** + * Attempts search using Startpage HTML parsing + * @param {Object} params - Search parameters (query, maxResults, headers are read here) + * @param {number} [timeoutMs=10000] - Request timeout in milliseconds + * @returns {Promise<{data: Object, service: string}>} Result payload tagged 'startpage-html' + * @throws {Error} On a non-OK HTTP status or when no result could be parsed + */ +async function tryStartpageHTML(params, timeoutMs = 10000) { + const query = encodeURIComponent(params.query); + const maxResults = params.maxResults || 5; + const startTime = Date.now(); // wall-clock start, used for response_time below + + const searchUrl = `https://www.startpage.com/do/search?query=${query}&cat=web&pl=ext-ff&extVersion=1.3.0`; + + const response = await fetch(searchUrl, { + method: 'GET', + headers: { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Cache-Control': 'max-age=0', + ...params.headers + }, + signal: AbortSignal.timeout(timeoutMs) + }); + + if (!response.ok) { + throw new Error(`Startpage HTML error: ${response.status}`); + } + + const html = await response.text(); + + // Parse HTML results
(Startpage format) + const results = []; + const resultRegex = /

]*>([^<]+)<\/a><\/h3>[\s\S]*?

([^<]*)<\/p>/g; + + let match; + while ((match = resultRegex.exec(html)) !== null && results.length < maxResults) { + const [, url, title, snippet] = match; + + if (url && title) { + results.push({ + title: title.trim(), + url: url.startsWith('http') ? url : `https:${url}`, + content: snippet ? snippet.replace(/<[^>]*>/g, '').trim() : '', + score: 1.0 - (results.length * 0.1) + }); + } + } + + if (results.length === 0) { + throw new Error('No results found in Startpage HTML response'); + } + + const transformedResults = { + results, + answer: null, + query: params.query, + // Elapsed milliseconds; the previous `Date.now() - performance.now()` was not a duration + response_time: Date.now() - startTime + }; + + return { data: transformedResults, service: 'startpage-html' }; +} + +/** + * Generate random headers to avoid detection + * @returns {Object} Browser-like request headers with a User-Agent picked at random from a fixed list + */ +function generateRandomHeaders() { + const userAgents = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0' + ]; + + return { + 'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)], + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + }; +} + +/** + * Determines if an error is retryable + * @param {Error} error - The error to check + * @returns {boolean} True if the error is retryable + */ +function isRetryableError(error) { + // 403, 422, 429, 451, ECONNREFUSED, ETIMEDOUT are retryable + const errorMessage = error.message || ''; + // JSON.stringify throws on circular structures and can return undefined; + // fall back to a plain string so classifying an error never throws itself. + let errorString; + try { + errorString = JSON.stringify(error) ?? ''; + } catch { + errorString = String(error); + } + + return
error.code === 403 || + error.code === 422 || + error.code === 429 || + error.code === 451 || + error.code === 'ECONNREFUSED' || + error.code === 'ETIMEDOUT' || + // Substring checks: any message containing these digits or tokens counts as retryable + errorMessage.includes('403') || + errorMessage.includes('422') || + errorMessage.includes('429') || + errorMessage.includes('451') || + errorMessage.includes('SecurityCompromiseError') || + errorMessage.includes('blocked until') || + errorMessage.includes('ECONNREFUSED') || + errorMessage.includes('ETIMEDOUT') || + // Check for schema validation patterns + errorString.toLowerCase().includes('missing') || + errorString.toLowerCase().includes('input_schema') || + errorString.toLowerCase().includes('field required'); +} + +/** + * Handles URL extraction with retry logic. + * Makes up to maxRetries + 1 attempts and stops early on the first success + * or on the first error that isRetryableError rejects. + * @param {string} url - The URL to extract content from + * @param {Object} options - Extraction options; the whole object is also forwarded to extractContent + * @param {number} [options.maxRetries=3] - Number of retries after the first attempt + * @param {number} [options.timeout=15000] - Not read locally; reaches extractContent only through the forwarded options + * @returns {Object} Extraction results or error information + */ +async function handleURLExtraction(url, options = {}) { + const { maxRetries = 3, timeout = 15000 } = options; + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + try { + // Wait before each retry (deterministic, no jitter): 2s, 4s, then capped at 8s + if (attempt > 0) { + const delay = Math.min(1000 * Math.pow(2, attempt), 8000); // Exponential backoff up to 8s + await new Promise(resolve => setTimeout(resolve, delay)); + } + + // Try to extract content with custom headers + // (options is spread last, so caller-supplied headers/includeImages override these defaults) + const extractOptions = { + headers: generateRandomHeaders(), + includeImages: false, // Don't include images by default for faster processing + ...options + }; + + const results = await extractContent(url, extractOptions); + + return { + success: true, + data: results, + attempt: attempt + 1, + isURLExtraction: true + }; + + } catch (error) { + console.error(`URL extraction attempt ${attempt + 1} failed:`, error.message); + + // Check if it's a retryable error + if (attempt === maxRetries || !isRetryableError(error)) { + return { + error: true, + message: `Failed to extract content from URL: ${error.message}`, + attempt:
attempt + 1, + isURLExtraction: true + }; + } + + // Continue to next attempt + } + } +} \ No newline at end of file diff --git a/hooks/hooks.json b/hooks/hooks.json new file mode 100644 index 0000000..ec9740a --- /dev/null +++ b/hooks/hooks.json @@ -0,0 +1,16 @@ +{ + "hooks": { + "PostToolUse": [ + { + "matcher": "WebSearch|WebFetch", + "hooks": [ + { + "type": "command", + "command": "node ${CLAUDE_PLUGIN_ROOT}/hooks/handle-web-search.mjs", + "timeout": 30 + } + ] + } + ] + } +} \ No newline at end of file diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..e8ccd71 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,73 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:shrwnsan/vibekit-claude-plugins:plugins/search-plus", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "1047e286a3e72f88d722111115f1fa5a1e5ee054", + "treeHash": "eb8fcc01a8e076e99e001571d97f408dd3816111d7506765f358eb4af324f7a9", + "generatedAt": "2025-11-28T10:28:20.330026Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "search-plus", + "description": "Enhanced web search with multi-service fallback architecture (Tavily + Jina.ai) and comprehensive error handling", + "version": "2.7.1" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "032993abd3e8e5d59ff81871b97a172782acb7a9b336b90c5793bf3a7c390c44" + }, + { + "path": "agents/search-plus.md", + "sha256": "e41db8f1cabce245a70872441f555850b01ec605792fd075b54fe1c0a05a8fea" + }, + { + "path": "hooks/handle-search-error.mjs", + "sha256": "67fc8854bd437d18d67493c5a9d1c74b1e101eee803036f6d896f4c9d9b3f431" + }, + { + "path": "hooks/content-extractor.mjs", + "sha256": 
"67ac83b223ef83430992cb97f82bddd2be53e683647d052cf149ccaadc2d2488" + }, + { + "path": "hooks/handle-rate-limit.mjs", + "sha256": "dd63c10698d954a8c36f0f6fc991c5b30bac2c5e4ce95ca254bf53b3788300c0" + }, + { + "path": "hooks/hooks.json", + "sha256": "0d64751625cff4e00b9103a887650b791e4ab3663c36fe4b9a09535b2d15c25a" + }, + { + "path": "hooks/handle-web-search.mjs", + "sha256": "e834f37bc486f37db4617ffa321b0fb388a57280c15114bc2ce1c9be019142de" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "a627b28135f417cc2c5d1c0e7484ea416cc244bc4f367ad9aa21e0d34bd3b011" + }, + { + "path": "commands/search-plus.md", + "sha256": "554526f9b69f9444182667e3945138267b5b7606c7d8326b1f3f5b391f08d429" + }, + { + "path": "skills/search-plus/SKILL.md", + "sha256": "354fc85683ee2fd2a91f741d1a4aa53a2ad24ea2c8498ce7c013c0a545f7d37c" + } + ], + "dirSha256": "eb8fcc01a8e076e99e001571d97f408dd3816111d7506765f358eb4af324f7a9" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/search-plus/SKILL.md b/skills/search-plus/SKILL.md new file mode 100644 index 0000000..53f7f93 --- /dev/null +++ b/skills/search-plus/SKILL.md @@ -0,0 +1,72 @@ +--- +name: meta-searching +description: Extracts web content and performs reliable searches when standard tools fail due to access restrictions, rate limiting, or validation errors. Use when encountering 403/429/422 errors, blocked documentation sites, or silent search failures. **Direct tool execution often fails for these scenarios - search-plus agent delegation provides the most reliable results.** +allowed-tools: + - web_search + - web_fetch +--- + +# Meta Searching + +Advanced federated web search that overcomes access restrictions, rate limiting, and validation errors by intelligently combining multiple search services. 
+ +## When to Use + +**Use this skill when you encounter:** +- 403 Forbidden errors from documentation sites or APIs +- 429 Rate Limited responses during research or documentation analysis +- 422 validation errors from web services +- Silent failures where standard search returns empty results or times out +- Need to extract specific content from blocked URLs or paywalled sites + +**This skill provides specialized error handling and multi-service extraction when standard tools fail.** + +## Capabilities + +### Multi-Service Intelligence +- **Federated Search**: Combines Tavily Extract API with Jina.ai fallback for 100% reliability +- **Smart Service Selection**: Automatically chooses optimal service based on content type and domain characteristics +- **Zero Single Point of Failure**: Multiple service providers guarantee reliable results + +### Error Resolution +- **403 Forbidden**: Resolves access restrictions using alternative extraction methods +- **429 Rate Limited**: Handles rate limiting with intelligent retry strategies +- **422 Validation**: Fixes schema validation issues through request adaptation +- **Timeout Prevention**: Eliminates "Did 0 searches..." 
responses and empty results + +### Content Access +- **Direct URL Extraction**: Extracts content from blocked documentation sites, articles, and repositories +- **Format Preservation**: Maintains document structure, code formatting, and markdown +- **Intelligent Fallback**: Switches between services when primary approaches fail + +## Examples + +### Documentation Research +``` +"Extract content from the Claude Code documentation at https://docs.anthropic.com/en/docs/claude-code" +"Research web scraping best practices from documentation that blocks access" +"Analyze this GitHub repository's README: https://github.com/example/repo" +``` + +### Error Recovery Scenarios +``` +"This website is blocking access with 403 errors, extract the content" +"Search failed with rate limiting, retry with enhanced error handling" +"Getting 422 validation errors, resolve and extract the information" +"Standard search returned no results, try enhanced extraction methods" +``` + +### Content Extraction +``` +"Extract and summarize the technical article at this URL" +"Get information from documentation sites that typically block access" +"Research current information that standard tools cannot reach" +``` + +## Limitations + +- Requires internet connectivity and API configuration +- Slower than basic search due to comprehensive error handling (2-3x longer) +- Some paywalled content may remain inaccessible +- Cannot bypass CAPTCHA or advanced bot protection +- May not work with sites requiring JavaScript execution \ No newline at end of file