// hooks/content-extractor.mjs import { setTimeout } from 'timers/promises'; import { promises as dns } from 'dns'; import net from 'net'; /** * Enhanced Content Extractor with Service Selection Strategy * * Implements optimal fallback strategy based on comprehensive testing: * Primary: Tavily Extract API (100% success rate, 863ms avg) - FASTEST AND MOST RELIABLE * Fallback: Jina.ai Public Endpoint (75% success rate, 1,066ms avg) - Good for documentation * Optional: Jina.ai API (88% success rate, 2,331ms avg) - Slower, for cost tracking only */ // Scalable fallback service definitions const FALLBACK_SERVICES = { cacheServices: [ { name: 'Google Web Cache', pattern: (url) => `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`, timeout: 15000, priority: 1, notes: 'Google web cache - fastest but sometimes blocked' }, { name: 'Internet Archive JSON API', pattern: async (url) => { try { const response = await fetch(`https://archive.org/wayback/available?url=${encodeURIComponent(url)}`, { timeout: 10000, headers: { 'Accept': 'application/json' } }); const data = await response.json(); if (data.archived_snapshots?.closest?.available) { return data.archived_snapshots.closest.url; } return null; } catch (error) { return null; } }, timeout: 15000, priority: 2, notes: 'Archive.org official API - most reliable for older content' }, { name: 'Internet Archive Direct', pattern: (url) => `https://web.archive.org/web/2/${encodeURIComponent(url)}`, timeout: 20000, priority: 3, notes: 'Direct archive.org access' }, { name: 'Bing Cache', pattern: (url) => `https://cc.bingj.com/cache.aspx?d=&w=${encodeURIComponent(url)}`, timeout: 20000, priority: 4, notes: 'Microsoft Bing cache - alternative to Google' }, { name: 'Yandex Turbo', pattern: (url) => `https://yandex.com/turbo?text=${encodeURIComponent(url)}`, timeout: 15000, priority: 5, notes: 'Yandex turbo mode - often good for news/blog content' } ], jinaFormats: [ { name: 'Standard', pattern: (url) => url, timeout: 10000 }, { name: 'Double Redirect', pattern: (url) => `https://r.jina.ai/http://${encodeURIComponent(url)}`, timeout: 12000 }, { name: 'Triple Redirect', pattern: (url) => `https://r.jina.ai/http://r.jina.ai/http://${encodeURIComponent(url)}`, timeout: 15000 }, { name: 'Text Extractor', pattern: (url) => `https://r.jina.ai/http://r.jina.ai/http://textise dot iitty?url=${encodeURIComponent(url)}`, timeout: 10000 } ], userAgents: [ { name: 'Chrome Browser', headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', 'Cache-Control': 'max-age=0' }, timeout: 30000 }, { name: 'cURL', headers: { 'User-Agent': 'curl/8.0.0', 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive' }, timeout: 20000 }, { name: 'Python Requests', headers: { 'User-Agent': 'python-requests/2.31.0', 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive' }, timeout: 15000 }, { name: 'Wget', headers: { 'User-Agent': 'Wget/1.21.3', 'Accept': '*/*', 'Accept-Encoding': 'identity' }, timeout: 25000 } ] }; // Service configuration with fallback for backward compatibility const TAVILY_API_KEY = process.env.SEARCH_PLUS_TAVILY_API_KEY || process.env.TAVILY_API_KEY || null; const JINA_API_KEY = process.env.SEARCH_PLUS_JINA_API_KEY || process.env.JINA_API_KEY || null; // Show deprecation warning if using old variables if (!process.env.SEARCH_PLUS_TAVILY_API_KEY && process.env.TAVILY_API_KEY) { console.warn('⚠️ TAVILY_API_KEY is deprecated. Please update to SEARCH_PLUS_TAVILY_API_KEY'); } if (!process.env.SEARCH_PLUS_JINA_API_KEY && process.env.JINA_API_KEY) { console.warn('⚠️ JINA_API_KEY is deprecated. Please update to SEARCH_PLUS_JINA_API_KEY'); } const TAVILY_EXTRACT_URL = 'https://api.tavily.com/extract'; const JINA_READER_PUBLIC_URL = 'https://r.jina.ai/'; const JINA_READER_API_URL = 'https://r.jina.ai/'; // Service configuration based on research findings const SERVICES = { tavily: { name: 'Tavily Extract API', url: TAVILY_EXTRACT_URL, successRate: 100, avgResponseTime: 863, cost: 'paid', requiresAuth: true, bestFor: ['general', 'problematic_domains', 'financial', 'social_media', 'primary_choice'] }, jinaPublic: { name: 'Jina.ai Public Reader', url: JINA_READER_PUBLIC_URL, successRate: 75, avgResponseTime: 1066, cost: 'free', requiresAuth: false, bestFor: ['documentation', 'api_docs', 'technical_content'] }, jinaAPI: { name: 'Jina.ai API Reader', url: JINA_READER_API_URL, successRate: 88, avgResponseTime: 2331, cost: 'free', requiresAuth: true, bestFor: ['enhanced_metadata', 'reliability'] // 2.7x slower - provides detailed analytics } }; /** * Determines if a URL is likely to be documentation-heavy * Based on research showing Jina.ai excels at documentation extraction */ function isDocumentationSite(url) { const docPatterns = [ /docs?\./, /documentation/, /api.*docs/, /developer/, /reference/, /guide/, /tutorial/, /swagger/, /openapi/, /postman/, /readthedocs/, /gitbook/ ]; return docPatterns.some(pattern => pattern.test(url.toLowerCase())); } /** * Determines if a URL is likely to be problematic for direct access * Based on research showing Tavily handles these domains better */ function isProblematicDomain(url) { const problematicPatterns = [ /reddit\.com/, /finance\.yahoo\.com/, /twitter\.com/, /facebook\.com/, /instagram\.com/, /linkedin\.com/, /medium\.com/, /news\./, /coingecko\.com/, /binance\.com/ ]; return problematicPatterns.some(pattern => pattern.test(url.toLowerCase())); } /** * Validates Tavily API key with a simple test call */ async function validateTavilyAPIKey() { if (!TAVILY_API_KEY) { return { valid: false, reason: 'API key not configured' }; } try { const testResponse = await fetch('https://api.tavily.com/search', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ api_key: TAVILY_API_KEY, query: 'test', max_results: 1 }), signal: AbortSignal.timeout(5000) }); if (testResponse.status === 401 || testResponse.status === 403) { const errorData = await testResponse.json().catch(() => ({})); return { valid: false, reason: `Invalid API key: ${errorData.detail?.error || 'Unauthorized'}` }; } return { valid: true }; } catch (error) { return { valid: false, reason: `API key validation failed: ${error.message}` }; } } /** * Extracts content using Tavily Extract API */ async function extractWithTavily(url, options = {}, timeoutMs = 15000) { const startTime = Date.now(); if (!TAVILY_API_KEY) { throw new Error('Tavily API key not configured'); } const requestBody = { api_key: TAVILY_API_KEY, urls: [url.trim()] }; // Add optional parameters if (options.includeImages) requestBody.include_images = options.includeImages; if (options.extractDepth) requestBody.extract_depth = options.extractDepth; try { const controller = new AbortController(); const timeoutId = setTimeout(timeoutMs, null).then(() => controller.abort()); const response = await fetch(TAVILY_EXTRACT_URL, { method: 'POST', headers: { 'Content-Type': 'application/json', ...options.headers }, body: JSON.stringify(requestBody), signal: controller.signal }); clearTimeout(timeoutId); if (!response.ok) { const errorData = await response.json().catch(() => ({})); throw new Error(`Tavily API error: ${response.status} - ${errorData.error || response.statusText}`); } const data = await response.json(); const content = data.results && data.results[0] ? data.results[0].content || data.results[0].raw_content : ''; return { success: true, content, contentLength: content.length, service: 'tavily', url, responseTime: Date.now() - startTime, metadata: { service: SERVICES.tavily, responseData: data, hasResults: data.results && data.results.length > 0, title: data.results && data.results[0] ? data.results[0].title : null } }; } catch (error) { return { success: false, error: { code: extractErrorCode(error.message), message: error.message }, service: 'tavily', url, responseTime: Date.now() - startTime, content: '', metadata: { service: SERVICES.tavily, errorType: error.name } }; } } /** * Extracts content using Jina.ai Public Endpoint */ async function extractWithJinaPublic(url, options = {}, timeoutMs = 10000) { const startTime = Date.now(); try { const jinaUrl = `${JINA_READER_PUBLIC_URL}${url}`; const response = await fetch(jinaUrl, { method: 'GET', headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Search-Plus-Content-Extractor/1.0)', ...options.headers }, signal: AbortSignal.timeout(timeoutMs) }); if (!response.ok) { const errorText = await response.text(); throw new Error(`Jina.ai Public error: ${response.status} - ${errorText}`); } const content = await response.text(); return { success: true, content, contentLength: content.length, service: 'jinaPublic', url, responseTime: Date.now() - startTime, metadata: { service: SERVICES.jinaPublic, responseStatus: response.status, contentType: response.headers.get('content-type') } }; } catch (error) { return { success: false, error: { code: extractErrorCode(error.message), message: error.message }, service: 'jinaPublic', url, responseTime: Date.now() - startTime, content: '', metadata: { service: SERVICES.jinaPublic, errorType: error.name } }; } } /** * Extracts content using Jina.ai API (provides enhanced metadata and reliability) */ async function extractWithJinaAPI(url, options = {}, timeoutMs = 10000) { const startTime = Date.now(); if (!JINA_API_KEY) { throw new Error('Jina.ai API key not configured'); } try { const response = await fetch(JINA_READER_API_URL, { method: 'POST', headers: { 'Authorization': `Bearer ${JINA_API_KEY}`, 'Content-Type': 'application/json', 'Accept': 'application/json', ...options.headers }, body: JSON.stringify({ url: url, ...options.jinaOptions }), signal: AbortSignal.timeout(timeoutMs) }); if (!response.ok) { const errorText = await response.text(); throw new Error(`Jina.ai API error: ${response.status} - ${errorText}`); } const data = await response.json(); const content = data.data?.content || data.content || data.data || JSON.stringify(data); return { success: true, content, contentLength: content.length, service: 'jinaAPI', url, responseTime: Date.now() - startTime, metadata: { service: SERVICES.jinaAPI, responseData: data, tokenUsage: data.meta?.usage?.tokens || data.usage?.tokens, title: data.data?.title } }; } catch (error) { return { success: false, error: { code: extractErrorCode(error.message), message: error.message }, service: 'jinaAPI', url, responseTime: Date.now() - startTime, content: '', metadata: { service: SERVICES.jinaAPI, errorType: error.name } }; } } /** * Scalable ultra-resilient fallback using pattern-based services */ async function tryUltraResilientFallbacks(url, originalOptions, results) { log(`🚨 All standard services failed, trying ultra-resilient fallbacks...`); // Try 1: Enhanced Tavily with different user agents if (!originalOptions.triedEnhancedParams && (!results.find(r => r.error?.message?.includes('Unauthorized')))) { log(`🔧 Trying enhanced Tavily with different user agents...`); for (const userAgent of FALLBACK_SERVICES.userAgents.slice(0, 2)) { // Try top 2 user agents try { const enhancedResult = await extractWithTavily(url, { ...originalOptions, triedEnhancedParams: true, ...userAgent }); results.push(enhancedResult); if (enhancedResult.success && enhancedResult.contentLength > 0) { log(`✅ Enhanced Tavily (${userAgent.name}) extraction successful!`); return { success: true, result: enhancedResult }; } } catch (error) { log(`❌ Enhanced Tavily (${userAgent.name}) failed: ${error.message}`); } } } // Try 2: Enhanced cache services (with async pattern support and prioritization) if (!originalOptions.triedCacheServices) { log(`🕐️ Trying enhanced cache services...`); // Get max archive attempts from configuration (default to all if not specified) const maxAttempts = originalOptions.maxArchiveAttempts || FALLBACK_SERVICES.cacheServices.length; // Sort by priority and limit attempts const sortedCacheServices = [...FALLBACK_SERVICES.cacheServices] .sort((a, b) => a.priority - b.priority) .slice(0, maxAttempts); log(` Will try up to ${maxAttempts} cache services out of ${FALLBACK_SERVICES.cacheServices.length} available`); for (const cacheService of sortedCacheServices) { try { let cacheURL; // Handle async pattern functions (like Internet Archive API) if (typeof cacheService.pattern === 'function' && cacheService.constructor.name === 'AsyncFunction') { cacheURL = await cacheService.pattern(url); if (!cacheURL) { log(`⚠️ ${cacheService.name}: No cached version available`); continue; } } else { cacheURL = cacheService.pattern(url); } log(`🔍 Trying ${cacheService.name}: ${cacheURL.substring(0, 100)}...`); const cacheResult = await extractWithJinaPublic(cacheURL, { ...originalOptions, triedCacheServices: true, timeout: cacheService.timeout }); // Override service name to correctly identify which cache service was used if (cacheResult.success) { cacheResult.service = cacheService.name; cacheResult.metadata.service = cacheService.name; } results.push(cacheResult); if (cacheResult.success && cacheResult.contentLength > 100) { log(`✅ ${cacheService.name} extraction successful!`); return { success: true, result: cacheResult }; } } catch (error) { log(`❌ ${cacheService.name} failed: ${error.message}`); } } } // Try 3: Alternative Jina formats (pattern-based) if (!originalOptions.triedAltJina) { log(`🔄 Trying alternative Jina AI formats...`); for (const jinaFormat of FALLBACK_SERVICES.jinaFormats) { try { const altURL = jinaFormat.pattern(url); const altResult = await extractWithJinaPublic(altURL, { ...originalOptions, triedAltJina: true, timeout: jinaFormat.timeout }); results.push(altResult); if (altResult.success && altResult.contentLength > 50) { log(`✅ Jina AI (${jinaFormat.name}) extraction successful!`); return { success: true, result: altResult }; } } catch (error) { log(`❌ Jina AI (${jinaFormat.name}) failed: ${error.message}`); } } } // Try 4: Connection/SSL workarounds with remaining user agents const lastResult = results[results.length - 1]; if (!originalOptions.triedSSLWorkaround && (lastResult?.error?.message?.includes('certificate') || lastResult?.error?.message?.includes('SSL') || lastResult?.error?.message?.includes('ECONNREFUSED') || lastResult?.error?.message?.includes('timeout'))) { log(`🔐 Trying connection/SSL workarounds with remaining user agents...`); for (const userAgent of FALLBACK_SERVICES.userAgents.slice(2)) { // Skip first 2 as they were tried above try { const workaroundResult = await extractWithJinaPublic(url, { ...originalOptions, triedSSLWorkaround: true, ...userAgent }); results.push(workaroundResult); if (workaroundResult.success && workaroundResult.contentLength > 0) { log(`✅ SSL/Connection workaround (${userAgent.name}) extraction successful!`); return { success: true, result: workaroundResult }; } } catch (error) { log(`❌ SSL/Connection workaround (${userAgent.name}) failed: ${error.message}`); } } } log(`🏁 Ultra-resilient fallback attempts completed (${results.length - 3} additional attempts)`); return { success: false, result: lastResult }; } /** * Extracts error code from error message for classification */ function extractErrorCode(errorMessage) { if (errorMessage.includes('403')) return '403'; if (errorMessage.includes('429')) return '429'; if (errorMessage.includes('451')) return '451'; if (errorMessage.includes('400')) return '400'; if (errorMessage.includes('404')) return '404'; if (errorMessage.includes('timeout')) return 'TIMEOUT'; if (errorMessage.includes('ECONNREFUSED')) return 'ECONNREFUSED'; if (errorMessage.includes('incorrect header check')) return 'HEADER_CHECK'; if (errorMessage.includes('SecurityCompromiseError')) return 'SECURITY_COMPROMISE'; if (errorMessage.includes('Forbidden')) return 'FORBIDDEN'; return 'UNKNOWN'; } /** * Smart 404 Configuration System * Provides intelligent 404 handling with user-configurable modes */ // Mode presets for different 404 handling strategies const MODE_PRESETS = { disabled: { enabled: false, archiveProbability: 0.0, maxArchiveAttempts: 0, description: 'Skip all archive attempts for 404 errors (fastest)' }, conservative: { enabled: true, archiveProbability: 0.3, maxArchiveAttempts: 1, description: 'Try archives for 30% of 404s, high-value domains only' }, normal: { enabled: true, archiveProbability: 0.7, maxArchiveAttempts: 2, description: 'Balanced approach for most use cases' }, aggressive: { enabled: true, archiveProbability: 1.0, maxArchiveAttempts: 3, description: 'Try all archives for every 404 (maximum recovery)' } }; /** * Creates 404 configuration from user options */ function create404Config(options = {}) { // Check environment variable first, then options, then default to normal mode let mode = process.env.SEARCH_PLUS_404_MODE || options.mode || 'normal'; // Log if environment variable is being used if (process.env.SEARCH_PLUS_404_MODE) { log(`🌍 404 mode from environment variable: ${process.env.SEARCH_PLUS_404_MODE}`); } // Validate mode if (!MODE_PRESETS[mode]) { log(`⚠️ Invalid 404 mode "${mode}", falling back to "normal"`); mode = 'normal'; } // Start with preset configuration let config = { ...MODE_PRESETS[mode] }; // Override with specific options (power user customization) if (options.archiveProbability !== undefined) { config.archiveProbability = Math.max(0.0, Math.min(1.0, options.archiveProbability)); } if (options.maxArchiveAttempts !== undefined) { config.maxArchiveAttempts = Math.max(0, Math.min(5, options.maxArchiveAttempts)); } if (options.enabled !== undefined) { config.enabled = options.enabled; } // Add domain classifications config.highValueDomains = options.highValueDomains || [ 'docs.', 'documentation.', 'help.', 'support.', 'news.', 'blog.', 'article.', 'research.', 'wikipedia.', 'github.', 'stackoverflow.', 'medium.', 'dev.to', 'hashnode.' ]; config.lowValuePatterns = options.lowValuePatterns || [ 'api.', 'analytics.', 'ads.', 'tracking.', 'cdn.', 'static.', 'assets.', 'temp-', 'cache-', 'session-', 'token-' ]; config.customRules = options.customRules || {}; return config; } /** * Detects 404 status from URL patterns (when content extraction fails) */ function detect404FromURL(url) { if (!url || typeof url !== 'string') return { detected: false, patterns: [], source: 'url' }; const urlLower = url.toLowerCase(); // URL patterns that strongly indicate 404 status const urlPatterns = [ '/status/404', '/error/404', '/404.html', '/not-found', '/page-not-found' ]; const detectedPatterns = urlPatterns.filter(pattern => urlLower.includes(pattern)); return { detected: detectedPatterns.length > 0, patterns: detectedPatterns, source: 'url', confidence: detectedPatterns.length > 0 ? 0.8 : 0.0 }; } /** * Detects if content contains 404 error patterns * Now used for intelligent decision-making instead of blocking */ function detect404Error(content) { if (!content || typeof content !== 'string') return { detected: false, patterns: [] }; const contentLower = content.toLowerCase(); // 404 indicator patterns const patterns404 = [ '404: not found', 'error 404: not found', 'this page can\'t be found', 'page not found', 'lost in space', 'the page you\'re seeking might no longer exist', 'target url returned error 404', 'http 404', 'status: 404', 'this httpbin.org page can\'t be found' ]; const detectedPatterns = []; // Check for 404 patterns for (const pattern of patterns404) { if (contentLower.includes(pattern)) { detectedPatterns.push(pattern); } } return { detected: detectedPatterns.length > 0, patterns: detectedPatterns, confidence: Math.min(detectedPatterns.length / 3, 1.0) }; } /** * Determines if a URL should get archive recovery attempts */ function shouldTryArchives(url, detectionResult, config) { // Quick disable checks if (!config.enabled) return false; if (!detectionResult.detected) return true; // Not a 404, always try // Probability check if (Math.random() > config.archiveProbability) return false; // High-value domain check (always try for these) if (isHighValueDomain(url, config)) return true; // Low-value pattern check (skip these unless aggressive mode) if (isLowValueContent(url, config) && config.archiveProbability < 1.0) return false; // Custom rules check for (const [domain, rule] of Object.entries(config.customRules)) { if (url.includes(domain)) { return rule === 'always' || (rule === 'try' && Math.random() < 0.5); } } return true; } /** * Checks if URL is from a high-value domain that deserves archive recovery */ function isHighValueDomain(url, config) { const urlLower = url.toLowerCase(); return config.highValueDomains.some(domain => urlLower.includes(domain)); } /** * Checks if URL is low-value content that doesn't need archive recovery */ function isLowValueContent(url, config) { const urlLower = url.toLowerCase(); return config.lowValuePatterns.some(pattern => urlLower.includes(pattern)); } /** * Validates if extracted content is meaningful or just service error pages */ function validateMeaningfulContent(content, source = 'unknown') { if (!content || typeof content !== 'string' || content.trim().length === 0) { return { isMeaningful: false, reason: 'empty_content', source }; } const contentLower = content.toLowerCase(); // Patterns that indicate non-meaningful content (error pages, "no results" pages, etc.) const uselessPatterns = [ // Google Cache/Search error patterns 'did not match any documents', 'no cached version available', 'accessibility links', 'google apps', 'your search -', 'suggestions:', 'make sure all words are spelled correctly', 'footer links', // Jina.ai error patterns 'jina ai reader', 'failed to extract content', 'extraction failed', 'unable to access', 'error 404', 'error 403', 'error 429', 'error 451', 'timeouterror', 'navigation timeout', // Generic error patterns 'page not found', 'access denied', 'forbidden', 'rate limit', 'service unavailable', 'connection refused', // Cache service error patterns 'wayback machine', 'archive.org', 'this page is not available', 'cached page', 'webcache.googleusercontent.com', // Minimal content patterns 'title: cache:', 'url source:', 'markdown content:' ]; // Check for useless patterns for (const pattern of uselessPatterns) { if (contentLower.includes(pattern)) { return { isMeaningful: false, reason: 'useless_pattern_detected', pattern: pattern, source }; } } // Check for extremely short content (likely error pages) const contentLength = content.trim().length; if (contentLength < 100) { return { isMeaningful: false, reason: 'content_too_short', length: contentLength, source }; } // Check for content that's mostly HTML/structure without meaningful text const textContent = content.replace(/<[^>]*>/g, '').replace(/\s+/g, ' ').trim(); if (textContent.length < 50) { return { isMeaningful: false, reason: 'insufficient_text_content', textLength: textContent.length, source }; } // Check for repetitive content (indicates error pages or broken extraction) const words = textContent.split(' ').filter(w => w.length > 3); const uniqueWords = new Set(words); if (words.length > 10 && uniqueWords.size / words.length < 0.3) { return { isMeaningful: false, reason: 'repetitive_content', uniqueWordsRatio: uniqueWords.size / words.length, source }; } return { isMeaningful: true, reason: 'meaningful_content_detected', contentLength: contentLength, textLength: textContent.length, source }; } /** * Determines the fallback level based on service used and number of attempts */ function determineFallbackLevel(service, totalAttempts) { if (service === 'tavily') return 'primary'; if (service === 'jinaPublic') return 'secondary'; if (service === 'jinaAPI') return 'tertiary'; if (totalAttempts > 4) return 'ultra_resilient'; return 'unknown'; } /** * Determines the extraction strategy used */ function determineStrategy(isDoc, useEnhancedMetadata) { if (useEnhancedMetadata) return 'tavily_first_enhanced_metadata'; if (isDoc) return 'tavily_first_optimal_fallback'; return 'tavily_first_default'; } /** * Checks if an IP address is in a private or reserved range. * @param {string} ip - The IP address to check. * @returns {boolean} - True if the IP is private, false otherwise. */ function isPrivateIP(ip) { if (net.isIPv4(ip)) { const parts = ip.split('.').map(part => parseInt(part, 10)); // 127.0.0.0/8 - Loopback if (parts[0] === 127) return true; // 10.0.0.0/8 - Private if (parts[0] === 10) return true; // 172.16.0.0/12 - Private if (parts[0] === 172 && parts[1] >= 16 && parts[1] <= 31) return true; // 192.168.0.0/16 - Private if (parts[0] === 192 && parts[1] === 168) return true; // 169.254.0.0/16 - Link-local (includes AWS metadata service) if (parts[0] === 169 && parts[1] === 254) return true; } // No IPv6 checks for now as per requirements, but can be added. return false; } /** * Validates and normalizes malformed URLs before extraction */ async function validateAndNormalizeURL(url) { const issues = []; let normalizedURL = url; // Check for double protocol issues if (url.includes('http://https://') || url.includes('https://http://')) { issues.push('double_protocol'); // Fix double protocol normalizedURL = url.replace(/https?:\/\/https?:\/\//, 'https://'); } // Check for spaces in URL (common issue from "textise dot iitty") if (url.includes(' dot ') || url.includes(' ')) { issues.push('spaces_in_domain'); // Try to fix common patterns normalizedURL = normalizedURL.replace(/ dot /g, '.').replace(/\s+/g, ''); } // Check for malformed Jina AI URLs if (url.includes('r.jina.ai/http://') && !url.includes('r.jina.ai/http://https://')) { issues.push('malformed_jina_url'); // This is actually the correct pattern for Jina AI } // Basic URL validation and SSRF Protection let parsedURL; try { parsedURL = new URL(normalizedURL); } catch (error) { issues.push('invalid_url_format'); return { valid: false, issues, error: `Invalid URL format: ${error.message}`, originalURL: url, normalizedURL: null }; } // SSRF Protection Step 1: Protocol check if (parsedURL.protocol !== 'http:' && parsedURL.protocol !== 'https:') { issues.push('invalid_protocol'); return { valid: false, issues, error: `SSRF attack detected: Invalid protocol '${parsedURL.protocol}'. Only HTTP and HTTPS are allowed.`, originalURL: url, normalizedURL }; } const { hostname } = parsedURL; // SSRF Protection Step 2: Hostname check if (hostname === 'localhost' || hostname.endsWith('.local')) { issues.push('forbidden_hostname'); return { valid: false, issues, error: `SSRF attack detected: Hostname '${hostname}' is forbidden.`, originalURL: url, normalizedURL }; } // SSRF Protection Step 3: Resolve hostname to IP and check let ipAddress; if (net.isIP(hostname)) { ipAddress = hostname; } else { try { const { address } = await dns.lookup(hostname); ipAddress = address; } catch (error) { issues.push('dns_lookup_failed'); return { valid: false, issues, error: `DNS lookup failed for hostname: ${hostname}. ${error.message}`, originalURL: url, normalizedURL: null }; } } if (isPrivateIP(ipAddress)) { issues.push('private_ip_detected'); return { valid: false, issues, error: `SSRF attack detected: IP address ${ipAddress} is in a forbidden range.`, originalURL: url, normalizedURL }; } // Check for obviously problematic domains that would cause API failures const problematicPatterns = [ /textise dot iitty/i, /textise\.iitty/i, // The normalized version is still invalid /example dot com/i, /example\.com$/i, // Generic example domain /test dot /i, /\.com\.[a-z]/i, // Likely malformed TLD /r\.jina\.ai\/http:\/\/[^/]*\.[a-z]{2,}\/?$/i // Jina AI with obviously fake domain ]; for (const pattern of problematicPatterns) { if (pattern.test(normalizedURL)) { issues.push('suspicious_domain_pattern'); break; } } // If we have suspicious patterns that can't be trusted, mark as invalid if (issues.includes('suspicious_domain_pattern')) { return { valid: false, issues, error: `Unfixable URL issues: suspicious or test domain detected`, originalURL: url, normalizedURL: null }; } // If we have issues but can normalize, return the fixed version if (issues.length > 0 && normalizedURL !== url) { return { valid: true, issues, originalURL: url, normalizedURL, hasFixes: true, message: `URL normalized: ${issues.join(', ')}` }; } // If we have issues that can't be automatically fixed if (issues.length > 0) { return { valid: false, issues, error: `Unfixable URL issues: ${issues.join(', ')}`, originalURL: url, normalizedURL: null }; } // URL is valid return { valid: true, issues: [], originalURL: url, normalizedURL: url, hasFixes: false }; } /** * Performs comprehensive service health check */ async function performServiceHealthCheck() { const healthStatus = { tavily: { available: false, error: null }, jinaPublic: { available: false, error: null }, jinaAPI: { available: false, error: null } }; // Check Tavily API const tavilyValidation = await validateTavilyAPIKey(); healthStatus.tavily.available = tavilyValidation.valid; healthStatus.tavily.error = tavilyValidation.reason; // Check Jina Public try { const jinaTest = await fetch('https://r.jina.ai/http://example.com', { method: 'GET', signal: AbortSignal.timeout(5000) }); healthStatus.jinaPublic.available = jinaTest.ok; if (!jinaTest.ok) { healthStatus.jinaPublic.error = `HTTP ${jinaTest.status}`; } } catch (error) { healthStatus.jinaPublic.error = error.message; } // Check Jina API (if key is available) if (JINA_API_KEY) { try { const jinaAPITest = await fetch('https://r.jina.ai/', { method: 'POST', headers: { 'Authorization': `Bearer ${JINA_API_KEY}`, 'Content-Type': 'application/json' }, body: JSON.stringify({ url: 'http://example.com' }), signal: AbortSignal.timeout(5000) }); healthStatus.jinaAPI.available = jinaAPITest.ok; if (!jinaAPITest.ok) { healthStatus.jinaAPI.error = `HTTP ${jinaAPITest.status}`; } } catch (error) { healthStatus.jinaAPI.error = error.message; } } else { healthStatus.jinaAPI.available = false; healthStatus.jinaAPI.error = 'API key not configured'; } return healthStatus; } /** * Enhanced content extraction with optimal service selection strategy * * Strategy based on comprehensive research: * 1. Always start with Tavily (100% success rate, 863ms fastest) - PRIMARY CHOICE * 2. Documentation sites: Tavily First → Jina Public Fallback (better content for docs) * 3. Cost tracking: Tavily First → Jina API Fallback (only for token tracking) */ export async function extractContent(url, options = {}) { const startTime = Date.now(); const results = []; // Perform service health check at the start if (options.performHealthCheck !== false) { log(`🔍 Performing service health check...`); const healthStatus = await performServiceHealthCheck(); log(`📊 Service Health Status:`); log(` Tavily: ${healthStatus.tavily.available ? '✅ Available' : '❌ Unavailable - ' + healthStatus.tavily.error}`); log(` Jina Public: ${healthStatus.jinaPublic.available ? '✅ Available' : '❌ Unavailable - ' + healthStatus.jinaPublic.error}`); log(` Jina API: ${healthStatus.jinaAPI.available ? '✅ Available' : '❌ Unavailable - ' + healthStatus.jinaAPI.error}`); // If no services are available, fail early if (!healthStatus.tavily.available && !healthStatus.jinaPublic.available && !healthStatus.jinaAPI.available) { return { success: false, error: { code: 'ALL_SERVICES_DOWN', message: 'All extraction services are unavailable' }, content: '', contentLength: 0, service: 'none', url, responseTime: Date.now() - startTime, totalAttempts: 0, totalResponseTime: Date.now() - startTime, healthStatus, metadata: { extractionStrategy: 'all_services_failed', timestamp: new Date().toISOString() } }; } } // Initialize 404 configuration for smart handling const config404 = create404Config(options.config404 || { mode: 'normal' }); log(`🎯 404 Handling: ${config404.description}`); // Pre-validate and normalize URL before extraction const urlValidation = await validateAndNormalizeURL(url); let extractionURL = url; if (!urlValidation.valid) { log(`❌ URL validation failed: ${urlValidation.error}`); return { success: false, error: { code: 'INVALID_URL', message: urlValidation.error, issues: urlValidation.issues }, content: '', contentLength: 0, service: 'validation', url, responseTime: Date.now() - startTime, totalAttempts: 0, totalResponseTime: Date.now() - startTime, metadata: { extractionStrategy: 'url_validation_failed', timestamp: new Date().toISOString(), originalURL: urlValidation.originalURL, validationIssues: urlValidation.issues } }; } if (urlValidation.hasFixes) { log(`🔧 URL normalized: ${urlValidation.message}`); log(` Original: ${urlValidation.originalURL}`); log(` Normalized: ${urlValidation.normalizedURL}`); extractionURL = urlValidation.normalizedURL; } // Determine optimal strategy based on URL characteristics const isDoc = isDocumentationSite(extractionURL); const isProblematic = isProblematicDomain(extractionURL); const useEnhancedMetadata = options.enhancedMetadata || options.highVolume; log(`🎯 Extracting content from: ${extractionURL}`); if (extractionURL !== url) { log(` (Original URL: ${url})`); } log(` URL Type: ${isDoc ? 'Documentation site' : isProblematic ? 'Problematic domain' : 'General URL'}`); log(` Enhanced Metadata: ${useEnhancedMetadata ? 'enabled' : 'disabled'}`); let result; // Strategy 1: Always start with Tavily (research shows it's fastest and most reliable) log(`🚀 Using Tavily first...`); try { result = await extractWithTavily(extractionURL, options); results.push(result); } catch (error) { result = { success: false, error: { code: 'EXCEPTION', message: error.message }, service: 'tavily', url: extractionURL, originalURL: url, responseTime: Date.now() - startTime, content: '', contentLength: 0 }; results.push(result); log(`❌ Tavily extraction failed with exception: ${error.message}`); } // Determine fallback service based on specific needs and service availability let fallbackService = 'jinaPublic'; // Default fallback let fallbackReason = 'default'; if (useEnhancedMetadata && JINA_API_KEY) { fallbackService = 'jinaAPI'; fallbackReason = 'enhanced metadata requested'; } else if (isDoc) { fallbackService = 'jinaPublic'; fallbackReason = 'documentation site'; } // Enhanced fallback logic with better error detection const needsFallback = !result.success || result.error?.code === '401' || // Invalid API key result.error?.code === '403' || // Forbidden result.error?.code === '429' || // Rate limited result.error?.code === 'EXCEPTION' || // Exception occurred (result.contentLength === 0 && !options.skipEmptyFallback) || (useEnhancedMetadata && !result.success); if (needsFallback) { log(`⚠️ Tavily failed or returned empty, trying ${fallbackService} (${fallbackReason})...`); log(` Failure reason: ${result.error?.code || result.error?.message || 'Empty content'}`); let fallbackResult; try { if (fallbackService === 'jinaAPI' && JINA_API_KEY) { fallbackResult = await extractWithJinaAPI(extractionURL, options); } else { fallbackResult = await extractWithJinaPublic(extractionURL, options); } results.push(fallbackResult); // Use fallback if it succeeded if (fallbackResult.success && (fallbackResult.contentLength > 0 || useEnhancedMetadata)) { result = fallbackResult; log(`✅ Fallback to ${fallbackService} successful`); // Smart 404 detection for logging and metrics const detection404 = detect404Error(result.content); if (detection404.detected) { log(`🔍 404 patterns detected: ${detection404.patterns.join(', ')}`); result.fallback404Detection = detection404; } } else { log(`❌ Fallback to ${fallbackService} failed: ${fallbackResult.error?.message || 'Empty content'}`); } } catch (error) { log(`❌ Fallback to ${fallbackService} failed with exception: ${error.message}`); fallbackResult = { success: false, error: { code: 'EXCEPTION', message: error.message }, service: fallbackService, url: extractionURL, originalURL: url, responseTime: Date.now() - startTime, content: '', contentLength: 0 }; results.push(fallbackResult); } } // Final fallback if needed (try the remaining service) if ((!result.success || result.contentLength === 0) && !useEnhancedMetadata && JINA_API_KEY) { const finalService = fallbackService === 'jinaPublic' ? 'jinaAPI' : 'jinaPublic'; log(`🔄 Final fallback to ${finalService}...`); try { const finalFallback = finalService === 'jinaAPI' ? await extractWithJinaAPI(extractionURL, options) : await extractWithJinaPublic(extractionURL, options); results.push(finalFallback); if (finalFallback.success && finalFallback.contentLength > 0) { result = finalFallback; log(`✅ Final fallback to ${finalService} successful`); // Smart 404 detection for logging and metrics const detection404 = detect404Error(result.content); if (detection404.detected) { log(`🔍 404 patterns detected: ${detection404.patterns.join(', ')}`); result.finalFallback404Detection = detection404; } } else { log(`❌ Final fallback to ${finalService} failed`); } } catch (error) { log(`❌ Final fallback to ${finalService} failed with exception: ${error.message}`); results.push({ success: false, error: { code: 'EXCEPTION', message: error.message }, service: finalService, url: extractionURL, originalURL: url, responseTime: Date.now() - startTime, content: '', contentLength: 0 }); } } // Ultra-resilient fallback: Try pattern-based alternative approaches if all standard services failed // Use smart 404 configuration to decide whether to attempt recovery if (!result.success || result.contentLength === 0) { // Get the best 404 detection result we have let detection404 = result.fallback404Detection || result.finalFallback404Detection || { detected: false }; // If no content-based detection worked, try URL-based detection if (!detection404.detected) { detection404 = detect404FromURL(extractionURL); } // Determine if we should try archive recovery const shouldTry = shouldTryArchives(extractionURL, detection404, config404); if (shouldTry) { log(`🚨 Trying ultra-resilient fallbacks with 404 configuration...`); log(` 404 detected: ${detection404.detected} (source: ${detection404.source || 'content'}), Archive probability: ${config404.archiveProbability}`); // Pass 404 config to the ultra-resilient fallback system const ultraResilientOptions = { ...options, config404, maxArchiveAttempts: config404.maxArchiveAttempts }; const ultraResilientResult = await tryUltraResilientFallbacks(extractionURL, ultraResilientOptions, results); if (ultraResilientResult.success) { result = ultraResilientResult.result; results.push(ultraResilientResult.result); log(`✅ Ultra-resilient fallback successful via ${ultraResilientResult.result.service}`); } else { log(`❌ Ultra-resilient fallbacks also failed`); } } else { if (detection404.detected) { log(`⏭️ Skipping ultra-resilient fallbacks (404 detected, configuration: ${config404.mode})`); } else { log(`⏭️ Skipping ultra-resilient fallbacks (disabled by configuration)`); } } } const totalTime = Date.now() - startTime; // Return the successful result or the last attempted result // But only consider it successful if at least one service actually worked const hasAnySuccessfulService = results.some(r => r.success && (r.contentLength > 0 || useEnhancedMetadata)); const successfulResult = hasAnySuccessfulService ? results.find(r => r.success && (r.contentLength > 0 || useEnhancedMetadata)) : result; // Validate if the content is actually meaningful const contentValidation = validateMeaningfulContent(successfulResult.content, successfulResult.service); // Detect 404 patterns for metrics and intelligent handling const detection404 = detect404Error(successfulResult.content); // Determine honest success metrics const technicalSuccess = successfulResult.success && successfulResult.contentLength > 0; const meaningfulSuccess = technicalSuccess && contentValidation.isMeaningful; const fallbackLevel = determineFallbackLevel(successfulResult.service, results.length); // Log content validation results for debugging if (technicalSuccess && !meaningfulSuccess) { log(`⚠️ Technical success but content validation failed:`); log(` Reason: ${contentValidation.reason}${contentValidation.pattern ? ` (${contentValidation.pattern})` : ''}`); log(` Source: ${contentValidation.source}`); } else if (meaningfulSuccess) { log(`✅ Meaningful content extracted successfully (${contentValidation.contentLength} chars)`); } const finalResult = { ...successfulResult, // Legacy success field (for backwards compatibility) success: hasAnySuccessfulService, // Enhanced success reporting technicalSuccess, meaningfulSuccess, contentValidation, fallbackLevel, totalAttempts: results.length, totalResponseTime: totalTime, strategy: { isDocumentationSite: isDoc, isProblematicDomain: isProblematic, enhancedMetadataEnabled: useEnhancedMetadata, primaryService: 'tavily', // ALWAYS Tavily first fallbackService, fallbackReason }, allResults: results, metadata: { ...successfulResult.metadata, extractionStrategy: 'tavily_first_optimal_fallback', timestamp: new Date().toISOString(), totalTokensUsed: results.reduce((sum, r) => sum + (r.metadata?.tokenUsage || 0), 0), urlValidation: { originalURL: url, normalizedURL: extractionURL, wasNormalized: urlValidation.hasFixes, validationIssues: urlValidation.issues, validationMessage: urlValidation.message }, allServicesFailed: !hasAnySuccessfulService, ultraResilientAttempts: results.length > 3 ? results.length - 3 : 0, attemptedServices: results.map(r => r.service), successfulService: hasAnySuccessfulService ? results.find(r => r.success && (r.contentLength > 0 || useEnhancedMetadata))?.service : null, // New meaningful content metrics honestSuccessMetrics: { technicalSuccess, meaningfulSuccess, fallbackLevel, contentQuality: contentValidation.isMeaningful ? 'meaningful' : 'useless', contentIssues: contentValidation.isMeaningful ? null : { reason: contentValidation.reason, pattern: contentValidation.pattern, source: contentValidation.source }, // 404 handling metrics handling404: { detected404: detection404?.detected || false, fourOFourPatterns: (detection404 && detection404.patterns) ? detection404.patterns : [], fourOFourConfidence: detection404?.confidence || 0, attemptedArchives: shouldTryArchives(extractionURL, detection404 || { detected: false }, config404), archiveMode: config404.mode, archiveProbability: config404.archiveProbability, maxArchiveAttempts: config404.maxArchiveAttempts, isHighValueDomain: isHighValueDomain(extractionURL, config404), isLowValueContent: isLowValueContent(extractionURL, config404) } } } }; // If all services failed, add appropriate error information if (!hasAnySuccessfulService) { finalResult.error = { code: 'ALL_SERVICES_FAILED', message: 'All extraction services failed to retrieve content', attempts: results.length, serviceResults: results.map(r => ({ service: r.service, success: r.success, error: r.error?.code })), ultraResilientAttempts: results.length > 3 ? results.length - 3 : 0 }; } return finalResult; } /** * Performs a search using the Tavily API with enhanced error handling * @param {Object} params - Search parameters * @param {number} timeoutMs - Request timeout in milliseconds * @returns {Object} Search results */ export const tavily = { search: async function tavilySearch(params, timeoutMs = 15000) { const startTime = Date.now(); if (!TAVILY_API_KEY) { throw new Error('Tavily API key not configured'); } // Construct the request payload const requestBody = { api_key: TAVILY_API_KEY, query: params.query, max_results: params.maxResults || 5, include_answer: params.includeAnswer !== false, // Default to true include_raw_content: params.includeRawContent || false, num_days: params.numDays || 30, // Look back 30 days by default }; // Add headers if provided const headers = { 'Content-Type': 'application/json', ...params.headers }; try { // Create AbortController for timeout handling const controller = new AbortController(); const timeoutId = setTimeout(timeoutMs, null).then(() => { controller.abort(); }); // Make the API request const response = await fetch('https://api.tavily.com/search', { method: 'POST', headers, body: JSON.stringify(requestBody), signal: controller.signal }); // Clear the timeout if the request completes in time clearTimeout(timeoutId); if (!response.ok) { const errorData = await response.json().catch(() => ({})); throw new Error(`Tavily API error: ${response.status} - ${errorData.error || response.statusText}`); } const data = await response.json(); return data; } catch (error) { if (error.name === 'AbortError') { throw new Error(`Request timeout after ${timeoutMs}ms`); } else if (error.code === 'ECONNREFUSED') { throw new Error(`Connection refused when trying to reach Tavily API: ${error.message}`); } else { throw error; } } } }; /** * Simple logging function (can be replaced with proper logging) */ function log(message) { console.log(`[ContentExtractor] ${message}`); } /** * Batch content extraction for multiple URLs */ export async function extractContentBatch(urls, options = {}) { const results = []; const concurrency = options.concurrency || 3; log(`📦 Batch extracting ${urls.length} URLs with concurrency ${concurrency}`); for (let i = 0; i < urls.length; i += concurrency) { const batch = urls.slice(i, i + concurrency); const batchPromises = batch.map(url => extractContent(url, options)); const batchResults = await Promise.allSettled(batchPromises); batchResults.forEach((result, index) => { const url = batch[index]; if (result.status === 'fulfilled') { results.push({ url, ...result.value }); } else { results.push({ url, success: false, error: { code: 'BATCH_ERROR', message: result.reason.message }, content: '', contentLength: 0, service: 'batch_failed' }); } }); // Small delay between batches to be respectful to rate limits if (i + concurrency < urls.length) { await new Promise(resolve => setTimeout(resolve, 1000)); } } const successCount = results.filter(r => r.success && (r.contentLength > 0 || options.enhancedMetadata)).length; log(`✅ Batch extraction complete: ${successCount}/${urls.length} successful`); return { results, summary: { total: urls.length, successful: successCount, failed: urls.length - successCount, successRate: Math.round((successCount / urls.length) * 100) } }; } export default { extractContent, extractContentBatch, tavily, SERVICES, isDocumentationSite, isProblematicDomain };