Files
gh-shrwnsan-vibekit-claude-…/hooks/content-extractor.mjs
2025-11-30 08:57:03 +08:00

1706 lines
52 KiB
JavaScript

// hooks/content-extractor.mjs
import { setTimeout } from 'timers/promises';
import { promises as dns } from 'dns';
import net from 'net';
/**
* Enhanced Content Extractor with Service Selection Strategy
*
* Implements optimal fallback strategy based on comprehensive testing:
* Primary: Tavily Extract API (100% success rate, 863ms avg) - FASTEST AND MOST RELIABLE
* Fallback: Jina.ai Public Endpoint (75% success rate, 1,066ms avg) - Good for documentation
* Optional: Jina.ai API (88% success rate, 2,331ms avg) - Slower, for cost tracking only
*/
// Scalable fallback service definitions.
//
// Three independent tables drive the last-resort extraction attempts:
//   cacheServices - third-party caches/archives; `pattern` maps the target URL
//                   to a cache URL (it may be async and may resolve to null
//                   when no cached copy exists); tried in ascending `priority`.
//   jinaFormats   - alternative URL shapes handed to the Jina reader.
//   userAgents    - request header profiles with a per-profile timeout (ms).
const FALLBACK_SERVICES = {
  cacheServices: [
    {
      name: 'Google Web Cache',
      pattern: (url) => `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`,
      timeout: 15000,
      priority: 1,
      notes: 'Google web cache - fastest but sometimes blocked'
    },
    {
      name: 'Internet Archive JSON API',
      // Asks the Wayback availability API for the closest snapshot and
      // resolves to its URL, or to null when none exists or the lookup fails.
      pattern: async (url) => {
        try {
          const response = await fetch(`https://archive.org/wayback/available?url=${encodeURIComponent(url)}`, {
            // fetch() has no `timeout` option (it is silently ignored); an
            // AbortSignal is the supported way to bound the request.
            signal: AbortSignal.timeout(10000),
            headers: { 'Accept': 'application/json' }
          });
          const data = await response.json();
          if (data.archived_snapshots?.closest?.available) {
            return data.archived_snapshots.closest.url;
          }
          return null;
        } catch (error) {
          // Best-effort lookup: any failure is reported as "no snapshot".
          return null;
        }
      },
      timeout: 15000,
      priority: 2,
      notes: 'Archive.org official API - most reliable for older content'
    },
    {
      name: 'Internet Archive Direct',
      pattern: (url) => `https://web.archive.org/web/2/${encodeURIComponent(url)}`,
      timeout: 20000,
      priority: 3,
      notes: 'Direct archive.org access'
    },
    {
      name: 'Bing Cache',
      pattern: (url) => `https://cc.bingj.com/cache.aspx?d=&w=${encodeURIComponent(url)}`,
      timeout: 20000,
      priority: 4,
      notes: 'Microsoft Bing cache - alternative to Google'
    },
    {
      name: 'Yandex Turbo',
      pattern: (url) => `https://yandex.com/turbo?text=${encodeURIComponent(url)}`,
      timeout: 15000,
      priority: 5,
      notes: 'Yandex turbo mode - often good for news/blog content'
    }
  ],
  jinaFormats: [
    {
      name: 'Standard',
      pattern: (url) => url,
      timeout: 10000
    },
    {
      name: 'Double Redirect',
      pattern: (url) => `https://r.jina.ai/http://${encodeURIComponent(url)}`,
      timeout: 12000
    },
    {
      name: 'Triple Redirect',
      pattern: (url) => `https://r.jina.ai/http://r.jina.ai/http://${encodeURIComponent(url)}`,
      timeout: 15000
    },
    {
      name: 'Text Extractor',
      // NOTE(review): the host segment below contains literal spaces
      // ("textise dot iitty") and looks like an obfuscated placeholder rather
      // than a resolvable domain - confirm the intended service.
      pattern: (url) => `https://r.jina.ai/http://r.jina.ai/http://textise dot iitty?url=${encodeURIComponent(url)}`,
      timeout: 10000
    }
  ],
  userAgents: [
    {
      name: 'Chrome Browser',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
      },
      timeout: 30000
    },
    {
      name: 'cURL',
      headers: {
        'User-Agent': 'curl/8.0.0',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
      },
      timeout: 20000
    },
    {
      name: 'Python Requests',
      headers: {
        'User-Agent': 'python-requests/2.31.0',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
      },
      timeout: 15000
    },
    {
      name: 'Wget',
      headers: {
        'User-Agent': 'Wget/1.21.3',
        'Accept': '*/*',
        'Accept-Encoding': 'identity'
      },
      timeout: 25000
    }
  ]
};
// Credentials: the SEARCH_PLUS_-prefixed variables take precedence; the legacy
// un-prefixed names are still read so existing setups keep working.
const TAVILY_API_KEY = process.env.SEARCH_PLUS_TAVILY_API_KEY || process.env.TAVILY_API_KEY || null;
const JINA_API_KEY = process.env.SEARCH_PLUS_JINA_API_KEY || process.env.JINA_API_KEY || null;
// Emit a deprecation notice for each legacy variable that is the only one set.
for (const [preferredVar, legacyVar, notice] of [
  ['SEARCH_PLUS_TAVILY_API_KEY', 'TAVILY_API_KEY', '⚠️ TAVILY_API_KEY is deprecated. Please update to SEARCH_PLUS_TAVILY_API_KEY'],
  ['SEARCH_PLUS_JINA_API_KEY', 'JINA_API_KEY', '⚠️ JINA_API_KEY is deprecated. Please update to SEARCH_PLUS_JINA_API_KEY']
]) {
  if (!process.env[preferredVar] && process.env[legacyVar]) {
    console.warn(notice);
  }
}
// Service endpoints. The public and authenticated Jina readers share one URL.
const TAVILY_EXTRACT_URL = 'https://api.tavily.com/extract';
const JINA_READER_PUBLIC_URL = 'https://r.jina.ai/';
const JINA_READER_API_URL = 'https://r.jina.ai/';
// Static descriptor for each extraction backend. The successRate and
// avgResponseTime figures are recorded constants, not live measurements.
const SERVICES = {
  // Paid, authenticated service.
  tavily: {
    name: 'Tavily Extract API',
    url: TAVILY_EXTRACT_URL,
    successRate: 100,
    avgResponseTime: 863,
    cost: 'paid',
    requiresAuth: true,
    bestFor: ['general', 'problematic_domains', 'financial', 'social_media', 'primary_choice']
  },
  // Free reader endpoint that needs no credentials.
  jinaPublic: {
    name: 'Jina.ai Public Reader',
    url: JINA_READER_PUBLIC_URL,
    successRate: 75,
    avgResponseTime: 1066,
    cost: 'free',
    requiresAuth: false,
    bestFor: ['documentation', 'api_docs', 'technical_content']
  },
  // Authenticated reader endpoint; slowest of the three by the recorded average.
  jinaAPI: {
    name: 'Jina.ai API Reader',
    url: JINA_READER_API_URL,
    successRate: 88,
    avgResponseTime: 2331,
    cost: 'free',
    requiresAuth: true,
    bestFor: ['enhanced_metadata', 'reliability'] // 2.7x slower - provides detailed analytics
  }
};
/**
 * Heuristically decides whether a URL points at documentation-style content.
 *
 * The whole URL (not just the host) is lower-cased and tested against a fixed
 * list of documentation markers; the first match wins.
 *
 * @param {string} url - URL to classify.
 * @returns {boolean} True when any documentation marker occurs in the URL.
 */
function isDocumentationSite(url) {
  const docMarkers = [
    /docs?\./,
    /documentation/,
    /api.*docs/,
    /developer/,
    /reference/,
    /guide/,
    /tutorial/,
    /swagger/,
    /openapi/,
    /postman/,
    /readthedocs/,
    /gitbook/
  ];
  for (const marker of docMarkers) {
    if (marker.test(url.toLowerCase())) {
      return true;
    }
  }
  return false;
}
/**
 * Heuristically decides whether a URL belongs to a site that tends to block
 * or break direct extraction (social media, finance, news, ...).
 *
 * The lower-cased URL is tested against a fixed list of domain markers.
 *
 * @param {string} url - URL to classify.
 * @returns {boolean} True when any marker occurs in the URL.
 */
function isProblematicDomain(url) {
  const blockedMarkers = [
    /reddit\.com/,
    /finance\.yahoo\.com/,
    /twitter\.com/,
    /facebook\.com/,
    /instagram\.com/,
    /linkedin\.com/,
    /medium\.com/,
    /news\./,
    /coingecko\.com/,
    /binance\.com/
  ];
  for (const marker of blockedMarkers) {
    if (marker.test(url.toLowerCase())) {
      return true;
    }
  }
  return false;
}
/**
 * Probes the Tavily search endpoint with a one-result query to find out
 * whether the configured API key is accepted.
 *
 * Only HTTP 401/403 are treated as "key rejected"; any other status counts as
 * valid. Network errors and the 5 second timeout are reported as invalid.
 *
 * @returns {Promise<{valid: boolean, reason?: string}>} Validation outcome;
 *   `reason` is present only when `valid` is false.
 */
async function validateTavilyAPIKey() {
  if (!TAVILY_API_KEY) {
    return { valid: false, reason: 'API key not configured' };
  }
  const probePayload = {
    api_key: TAVILY_API_KEY,
    query: 'test',
    max_results: 1
  };
  try {
    const probe = await fetch('https://api.tavily.com/search', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(probePayload),
      signal: AbortSignal.timeout(5000)
    });
    const keyRejected = probe.status === 401 || probe.status === 403;
    if (!keyRejected) {
      return { valid: true };
    }
    // The error body is optional; fall back to an empty object if unparsable.
    const details = await probe.json().catch(() => ({}));
    return {
      valid: false,
      reason: `Invalid API key: ${details.detail?.error || 'Unauthorized'}`
    };
  } catch (error) {
    return {
      valid: false,
      reason: `API key validation failed: ${error.message}`
    };
  }
}
/**
 * Extracts content for a single URL through the Tavily Extract API.
 *
 * @param {string} url - Target URL (surrounding whitespace is trimmed).
 * @param {object} [options] - Optional settings: `includeImages`,
 *   `extractDepth` (forwarded to Tavily) and `headers` (merged into the
 *   request headers).
 * @param {number} [timeoutMs=15000] - Abort the request after this many ms.
 * @returns {Promise<object>} A result object with `success`, `content`,
 *   `service`, `url`, `responseTime` and `metadata`; failures are reported as
 *   `success: false` with an `error` `{ code, message }` instead of throwing.
 * @throws {Error} Synchronously rejects when no Tavily API key is configured.
 */
async function extractWithTavily(url, options = {}, timeoutMs = 15000) {
  const startTime = Date.now();
  if (!TAVILY_API_KEY) {
    throw new Error('Tavily API key not configured');
  }
  const requestBody = {
    api_key: TAVILY_API_KEY,
    urls: [url.trim()]
  };
  // Add optional parameters
  if (options.includeImages) requestBody.include_images = options.includeImages;
  if (options.extractDepth) requestBody.extract_depth = options.extractDepth;
  try {
    // The file-level `setTimeout` comes from 'timers/promises' and returns a
    // Promise, so it cannot be cancelled with clearTimeout(); the previous
    // manual timer therefore stayed pending for the full timeout after every
    // call. AbortSignal.timeout() needs no cleanup and produces a
    // "...due to timeout" error that extractErrorCode classifies as TIMEOUT.
    const response = await fetch(TAVILY_EXTRACT_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        ...options.headers
      },
      body: JSON.stringify(requestBody),
      signal: AbortSignal.timeout(timeoutMs)
    });
    if (!response.ok) {
      const errorData = await response.json().catch(() => ({}));
      throw new Error(`Tavily API error: ${response.status} - ${errorData.error || response.statusText}`);
    }
    const data = await response.json();
    // Prefer the cleaned `content`, falling back to `raw_content`.
    const content = data.results && data.results[0] ?
      data.results[0].content || data.results[0].raw_content :
      '';
    return {
      success: true,
      content,
      contentLength: content.length,
      service: 'tavily',
      url,
      responseTime: Date.now() - startTime,
      metadata: {
        service: SERVICES.tavily,
        responseData: data,
        hasResults: data.results && data.results.length > 0,
        title: data.results && data.results[0] ? data.results[0].title : null
      }
    };
  } catch (error) {
    return {
      success: false,
      error: {
        code: extractErrorCode(error.message),
        message: error.message
      },
      service: 'tavily',
      url,
      responseTime: Date.now() - startTime,
      content: '',
      metadata: {
        service: SERVICES.tavily,
        errorType: error.name
      }
    };
  }
}
/**
 * Fetches a URL through the unauthenticated Jina.ai reader endpoint.
 *
 * The target URL is appended verbatim to the reader base URL and requested
 * with GET. Failures never throw; they are returned as `success: false`.
 *
 * @param {string} url - Target URL to read.
 * @param {object} [options] - Optional settings; only `headers` is used here
 *   (merged over the default User-Agent).
 * @param {number} [timeoutMs=10000] - Abort the request after this many ms.
 * @returns {Promise<object>} Result object with `success`, `content`,
 *   `service`, `url`, `responseTime` and `metadata` (plus `contentLength` on
 *   success or `error` on failure).
 */
async function extractWithJinaPublic(url, options = {}, timeoutMs = 10000) {
  const startedAt = Date.now();
  const elapsed = () => Date.now() - startedAt;
  try {
    const readerURL = `${JINA_READER_PUBLIC_URL}${url}`;
    const requestHeaders = {
      'User-Agent': 'Mozilla/5.0 (compatible; Search-Plus-Content-Extractor/1.0)',
      ...options.headers
    };
    const reply = await fetch(readerURL, {
      method: 'GET',
      headers: requestHeaders,
      signal: AbortSignal.timeout(timeoutMs)
    });
    if (!reply.ok) {
      const failureBody = await reply.text();
      throw new Error(`Jina.ai Public error: ${reply.status} - ${failureBody}`);
    }
    const body = await reply.text();
    return {
      success: true,
      content: body,
      contentLength: body.length,
      service: 'jinaPublic',
      url,
      responseTime: elapsed(),
      metadata: {
        service: SERVICES.jinaPublic,
        responseStatus: reply.status,
        contentType: reply.headers.get('content-type')
      }
    };
  } catch (failure) {
    // Both HTTP errors (thrown above) and transport/timeout errors end up here.
    return {
      success: false,
      error: {
        code: extractErrorCode(failure.message),
        message: failure.message
      },
      service: 'jinaPublic',
      url,
      responseTime: elapsed(),
      content: '',
      metadata: {
        service: SERVICES.jinaPublic,
        errorType: failure.name
      }
    };
  }
}
/**
 * Fetches a URL through the authenticated Jina.ai reader API (JSON over POST).
 *
 * @param {string} url - Target URL to read.
 * @param {object} [options] - Optional settings: `headers` (merged into the
 *   request headers) and `jinaOptions` (merged into the JSON request body).
 * @param {number} [timeoutMs=10000] - Abort the request after this many ms.
 * @returns {Promise<object>} Result object with `success`, `content`,
 *   `service`, `url`, `responseTime` and `metadata`; request failures are
 *   returned as `success: false` with an `error` `{ code, message }`.
 * @throws {Error} Rejects when no Jina API key is configured.
 */
async function extractWithJinaAPI(url, options = {}, timeoutMs = 10000) {
  const startedAt = Date.now();
  if (!JINA_API_KEY) {
    throw new Error('Jina.ai API key not configured');
  }
  const elapsed = () => Date.now() - startedAt;
  try {
    const requestHeaders = {
      'Authorization': `Bearer ${JINA_API_KEY}`,
      'Content-Type': 'application/json',
      'Accept': 'application/json',
      ...options.headers
    };
    const payload = {
      url: url,
      ...options.jinaOptions
    };
    const reply = await fetch(JINA_READER_API_URL, {
      method: 'POST',
      headers: requestHeaders,
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(timeoutMs)
    });
    if (!reply.ok) {
      const failureBody = await reply.text();
      throw new Error(`Jina.ai API error: ${reply.status} - ${failureBody}`);
    }
    const parsed = await reply.json();
    // Accept several response shapes, ending with the raw JSON as a last resort.
    const body = parsed.data?.content || parsed.content || parsed.data || JSON.stringify(parsed);
    return {
      success: true,
      content: body,
      contentLength: body.length,
      service: 'jinaAPI',
      url,
      responseTime: elapsed(),
      metadata: {
        service: SERVICES.jinaAPI,
        responseData: parsed,
        tokenUsage: parsed.meta?.usage?.tokens || parsed.usage?.tokens,
        title: parsed.data?.title
      }
    };
  } catch (failure) {
    return {
      success: false,
      error: {
        code: extractErrorCode(failure.message),
        message: failure.message
      },
      service: 'jinaAPI',
      url,
      responseTime: elapsed(),
      content: '',
      metadata: {
        service: SERVICES.jinaAPI,
        errorType: failure.name
      }
    };
  }
}
/**
 * Last-resort extraction pipeline, run after the standard services failed.
 *
 * Stages, each skipped when its `tried*` flag is set on `originalOptions`:
 *   1. Tavily again with the first two user-agent profiles.
 *   2. Cache/archive services, by ascending priority, read through Jina.
 *   3. Alternative Jina URL formats.
 *   4. The remaining user-agent profiles through Jina, only when the most
 *      recent failure looks like a connection/SSL/timeout problem.
 *
 * Every attempt is appended to `results` (the array is mutated).
 *
 * @param {string} url - Target URL.
 * @param {object} originalOptions - Caller options; `maxArchiveAttempts`
 *   limits stage 2 (0 disables it, omitted means "all").
 * @param {object[]} results - Attempts made so far; extended in place.
 * @returns {Promise<{success: boolean, result: object|undefined}>} The first
 *   successful attempt, or `success: false` with the most recent attempt.
 */
async function tryUltraResilientFallbacks(url, originalOptions, results) {
  // Remember how many attempts preceded this call so the summary log counts
  // only the fallback attempts instead of assuming exactly three.
  const priorAttempts = results.length;
  log(`🚨 All standard services failed, trying ultra-resilient fallbacks...`);
  // Try 1: Enhanced Tavily with different user agents
  if (!originalOptions.triedEnhancedParams && (!results.find(r => r.error?.message?.includes('Unauthorized')))) {
    log(`🔧 Trying enhanced Tavily with different user agents...`);
    for (const userAgent of FALLBACK_SERVICES.userAgents.slice(0, 2)) { // Try top 2 user agents
      try {
        // The extractors take their timeout as the third argument; a
        // `timeout` key inside options is ignored, so pass it explicitly.
        const enhancedResult = await extractWithTavily(url, {
          ...originalOptions,
          triedEnhancedParams: true,
          ...userAgent
        }, userAgent.timeout);
        results.push(enhancedResult);
        if (enhancedResult.success && enhancedResult.contentLength > 0) {
          log(`✅ Enhanced Tavily (${userAgent.name}) extraction successful!`);
          return { success: true, result: enhancedResult };
        }
      } catch (error) {
        log(`❌ Enhanced Tavily (${userAgent.name}) failed: ${error.message}`);
      }
    }
  }
  // Try 2: Enhanced cache services (with async pattern support and prioritization)
  if (!originalOptions.triedCacheServices) {
    log(`🕐️ Trying enhanced cache services...`);
    // Default to all services only when the limit is absent; `??` keeps an
    // explicit 0 ("no archive attempts") from being treated as "unset".
    const maxAttempts = originalOptions.maxArchiveAttempts ?? FALLBACK_SERVICES.cacheServices.length;
    // Sort by priority and limit attempts
    const sortedCacheServices = [...FALLBACK_SERVICES.cacheServices]
      .sort((a, b) => a.priority - b.priority)
      .slice(0, maxAttempts);
    log(` Will try up to ${maxAttempts} cache services out of ${FALLBACK_SERVICES.cacheServices.length} available`);
    for (const cacheService of sortedCacheServices) {
      try {
        // Patterns may be sync or async. Awaiting handles both; the previous
        // check inspected the service object's constructor (always Object),
        // so async patterns were never awaited.
        const cacheURL = await cacheService.pattern(url);
        if (!cacheURL) {
          log(`⚠️ ${cacheService.name}: No cached version available`);
          continue;
        }
        log(`🔍 Trying ${cacheService.name}: ${cacheURL.substring(0, 100)}...`);
        const cacheResult = await extractWithJinaPublic(cacheURL, {
          ...originalOptions,
          triedCacheServices: true,
          timeout: cacheService.timeout
        }, cacheService.timeout);
        // Override service name to correctly identify which cache service was used
        if (cacheResult.success) {
          cacheResult.service = cacheService.name;
          cacheResult.metadata.service = cacheService.name;
        }
        results.push(cacheResult);
        if (cacheResult.success && cacheResult.contentLength > 100) {
          log(`${cacheService.name} extraction successful!`);
          return { success: true, result: cacheResult };
        }
      } catch (error) {
        log(`${cacheService.name} failed: ${error.message}`);
      }
    }
  }
  // Try 3: Alternative Jina formats (pattern-based)
  if (!originalOptions.triedAltJina) {
    log(`🔄 Trying alternative Jina AI formats...`);
    for (const jinaFormat of FALLBACK_SERVICES.jinaFormats) {
      try {
        const altURL = jinaFormat.pattern(url);
        const altResult = await extractWithJinaPublic(altURL, {
          ...originalOptions,
          triedAltJina: true,
          timeout: jinaFormat.timeout
        }, jinaFormat.timeout);
        results.push(altResult);
        if (altResult.success && altResult.contentLength > 50) {
          log(`✅ Jina AI (${jinaFormat.name}) extraction successful!`);
          return { success: true, result: altResult };
        }
      } catch (error) {
        log(`❌ Jina AI (${jinaFormat.name}) failed: ${error.message}`);
      }
    }
  }
  // Try 4: Connection/SSL workarounds with remaining user agents
  const lastResult = results[results.length - 1];
  if (!originalOptions.triedSSLWorkaround &&
    (lastResult?.error?.message?.includes('certificate') || lastResult?.error?.message?.includes('SSL') ||
      lastResult?.error?.message?.includes('ECONNREFUSED') || lastResult?.error?.message?.includes('timeout'))) {
    log(`🔐 Trying connection/SSL workarounds with remaining user agents...`);
    for (const userAgent of FALLBACK_SERVICES.userAgents.slice(2)) { // Skip first 2 as they were tried above
      try {
        const workaroundResult = await extractWithJinaPublic(url, {
          ...originalOptions,
          triedSSLWorkaround: true,
          ...userAgent
        }, userAgent.timeout);
        results.push(workaroundResult);
        if (workaroundResult.success && workaroundResult.contentLength > 0) {
          log(`✅ SSL/Connection workaround (${userAgent.name}) extraction successful!`);
          return { success: true, result: workaroundResult };
        }
      } catch (error) {
        log(`❌ SSL/Connection workaround (${userAgent.name}) failed: ${error.message}`);
      }
    }
  }
  log(`🏁 Ultra-resilient fallback attempts completed (${results.length - priorAttempts} additional attempts)`);
  // Report the most recent attempt, including any made during stage 4.
  return { success: false, result: results[results.length - 1] };
}
/**
 * Maps an error message to a coarse error code by substring search.
 *
 * Rules are checked in order and the first hit wins, so a message containing
 * several markers is classified by the earliest rule. Matching is
 * case-sensitive.
 *
 * @param {string} errorMessage - Message text to classify.
 * @returns {string} The matching code, or 'UNKNOWN' when nothing matches.
 */
function extractErrorCode(errorMessage) {
  // [substring to look for, code to report] - order matters.
  const classification = [
    ['403', '403'],
    ['429', '429'],
    ['451', '451'],
    ['400', '400'],
    ['404', '404'],
    ['timeout', 'TIMEOUT'],
    ['ECONNREFUSED', 'ECONNREFUSED'],
    ['incorrect header check', 'HEADER_CHECK'],
    ['SecurityCompromiseError', 'SECURITY_COMPROMISE'],
    ['Forbidden', 'FORBIDDEN']
  ];
  for (const [needle, code] of classification) {
    if (errorMessage.includes(needle)) {
      return code;
    }
  }
  return 'UNKNOWN';
}
/**
 * Smart 404 Configuration System
 *
 * Named presets controlling how hard the extractor tries to recover pages
 * that look like 404s from archives. Each preset carries:
 *   enabled            - whether archive recovery runs at all
 *   archiveProbability - chance (0..1) that a detected 404 gets an attempt
 *   maxArchiveAttempts - upper bound on archive services tried
 *   description        - human-readable summary used in logs
 */
const MODE_PRESETS = {
  // Never touch archives.
  disabled: {
    enabled: false,
    archiveProbability: 0.0,
    maxArchiveAttempts: 0,
    description: 'Skip all archive attempts for 404 errors (fastest)'
  },
  // Rarely try, and only one service.
  conservative: {
    enabled: true,
    archiveProbability: 0.3,
    maxArchiveAttempts: 1,
    description: 'Try archives for 30% of 404s, high-value domains only'
  },
  // Default trade-off.
  normal: {
    enabled: true,
    archiveProbability: 0.7,
    maxArchiveAttempts: 2,
    description: 'Balanced approach for most use cases'
  },
  // Always try, up to three services.
  aggressive: {
    enabled: true,
    archiveProbability: 1.0,
    maxArchiveAttempts: 3,
    description: 'Try all archives for every 404 (maximum recovery)'
  }
};
/**
 * Builds the effective 404-handling configuration.
 *
 * The mode is taken from the SEARCH_PLUS_404_MODE environment variable, then
 * `options.mode`, then 'normal'. An unknown mode falls back to 'normal'. The
 * chosen preset is copied and then overridden by explicit options.
 *
 * @param {object} [options] - User overrides: `mode`, `archiveProbability`
 *   (clamped to 0..1), `maxArchiveAttempts` (clamped to 0..5), `enabled`,
 *   `highValueDomains`, `lowValuePatterns`, `customRules`.
 * @returns {object} A fresh configuration object (presets are not mutated).
 */
function create404Config(options = {}) {
  // Check environment variable first, then options, then default to normal mode
  let mode = process.env.SEARCH_PLUS_404_MODE || options.mode || 'normal';
  // Log if environment variable is being used
  if (process.env.SEARCH_PLUS_404_MODE) {
    log(`🌍 404 mode from environment variable: ${process.env.SEARCH_PLUS_404_MODE}`);
  }
  // Validate mode. Own-property lookup is required: a plain truthiness test
  // would accept inherited keys such as "constructor" or "toString" and
  // produce a config with no preset fields at all.
  if (!Object.hasOwn(MODE_PRESETS, mode)) {
    log(`⚠️ Invalid 404 mode "${mode}", falling back to "normal"`);
    mode = 'normal';
  }
  // Start with a copy of the preset so overrides never leak into MODE_PRESETS
  const config = { ...MODE_PRESETS[mode] };
  // Override with specific options (power user customization)
  if (options.archiveProbability !== undefined) {
    config.archiveProbability = Math.max(0.0, Math.min(1.0, options.archiveProbability));
  }
  if (options.maxArchiveAttempts !== undefined) {
    config.maxArchiveAttempts = Math.max(0, Math.min(5, options.maxArchiveAttempts));
  }
  if (options.enabled !== undefined) {
    config.enabled = options.enabled;
  }
  // Add domain classifications (substring markers matched against the URL)
  config.highValueDomains = options.highValueDomains || [
    'docs.', 'documentation.', 'help.', 'support.',
    'news.', 'blog.', 'article.', 'research.',
    'wikipedia.', 'github.', 'stackoverflow.',
    'medium.', 'dev.to', 'hashnode.'
  ];
  config.lowValuePatterns = options.lowValuePatterns || [
    'api.', 'analytics.', 'ads.', 'tracking.',
    'cdn.', 'static.', 'assets.', 'temp-',
    'cache-', 'session-', 'token-'
  ];
  config.customRules = options.customRules || {};
  return config;
}
/**
 * Looks for path fragments in a URL that strongly suggest a 404 page.
 *
 * @param {string} url - URL to inspect (matched case-insensitively).
 * @returns {{detected: boolean, patterns: string[], source: string, confidence?: number}}
 *   Matching fragments in list order. `confidence` is 0.8 on a hit and 0
 *   otherwise; it is omitted entirely when `url` is not a non-empty string.
 */
function detect404FromURL(url) {
  const usable = Boolean(url) && typeof url === 'string';
  if (!usable) {
    return {
      detected: false,
      patterns: [],
      source: 'url'
    };
  }
  // Path fragments that strongly indicate a 404 status
  const markers = [
    '/status/404',
    '/error/404',
    '/404.html',
    '/not-found',
    '/page-not-found'
  ];
  const haystack = url.toLowerCase();
  const hits = [];
  for (const marker of markers) {
    if (haystack.includes(marker)) {
      hits.push(marker);
    }
  }
  const found = hits.length > 0;
  return {
    detected: found,
    patterns: hits,
    source: 'url',
    confidence: found ? 0.8 : 0.0
  };
}
/**
 * Scans extracted content for phrases that typically appear on 404 pages.
 *
 * @param {string} content - Extracted page text (matched case-insensitively).
 * @returns {{detected: boolean, patterns: string[], confidence?: number}}
 *   Matching phrases in list order. `confidence` grows with the number of
 *   matches (a third per match, capped at 1); it is omitted when `content` is
 *   not a non-empty string.
 */
function detect404Error(content) {
  const usable = Boolean(content) && typeof content === 'string';
  if (!usable) {
    return {
      detected: false,
      patterns: []
    };
  }
  // Phrases that indicate a 404 response
  const phrases = [
    '404: not found',
    'error 404: not found',
    'this page can\'t be found',
    'page not found',
    'lost in space',
    'the page you\'re seeking might no longer exist',
    'target url returned error 404',
    'http 404',
    'status: 404',
    'this httpbin.org page can\'t be found'
  ];
  const haystack = content.toLowerCase();
  const hits = phrases.filter((phrase) => haystack.includes(phrase));
  return {
    detected: hits.length > 0,
    patterns: hits,
    confidence: Math.min(hits.length / 3, 1.0)
  };
}
/**
 * Decides whether archive recovery should be attempted for a URL.
 *
 * Order of checks: disabled config -> never; not a detected 404 -> always;
 * random sampling against `archiveProbability`; high-value domain -> yes;
 * low-value content (unless probability is 1) -> no; first matching custom
 * rule ('always' -> yes, 'try' -> coin flip, anything else -> no); else yes.
 *
 * @param {string} url - URL under consideration.
 * @param {{detected: boolean}} detectionResult - Result of 404 detection.
 * @param {object} config - 404 configuration (see create404Config).
 * @returns {boolean} True when archives should be tried.
 */
function shouldTryArchives(url, detectionResult, config) {
  if (!config.enabled) {
    return false;
  }
  if (!detectionResult.detected) {
    // Not a 404, always try
    return true;
  }
  // Random sampling: a draw above the configured probability opts out.
  const sampledOut = Math.random() > config.archiveProbability;
  if (sampledOut) {
    return false;
  }
  if (isHighValueDomain(url, config)) {
    return true;
  }
  // Low-value content is skipped unless the probability is a full 1.0.
  const lowValue = isLowValueContent(url, config);
  if (lowValue && config.archiveProbability < 1.0) {
    return false;
  }
  // Only the first custom rule whose key occurs in the URL is applied.
  const matchedRule = Object.entries(config.customRules).find(([domain]) => url.includes(domain));
  if (matchedRule) {
    const [, rule] = matchedRule;
    return rule === 'always' || (rule === 'try' && Math.random() < 0.5);
  }
  return true;
}
/**
 * Tells whether a URL contains any of the configured high-value markers.
 *
 * @param {string} url - URL to test (lower-cased before matching).
 * @param {{highValueDomains: string[]}} config - 404 configuration.
 * @returns {boolean} True when at least one marker is a substring of the URL.
 */
function isHighValueDomain(url, config) {
  const haystack = url.toLowerCase();
  for (const marker of config.highValueDomains) {
    if (haystack.includes(marker)) {
      return true;
    }
  }
  return false;
}
/**
 * Tells whether a URL contains any of the configured low-value markers.
 *
 * @param {string} url - URL to test (lower-cased before matching).
 * @param {{lowValuePatterns: string[]}} config - 404 configuration.
 * @returns {boolean} True when at least one marker is a substring of the URL.
 */
function isLowValueContent(url, config) {
  const haystack = url.toLowerCase();
  for (const marker of config.lowValuePatterns) {
    if (haystack.includes(marker)) {
      return true;
    }
  }
  return false;
}
/**
 * Judges whether extracted content is real page text or just an error /
 * placeholder page produced by one of the services.
 *
 * Checks run in this order and the first failure is reported:
 *   1. empty or non-string content
 *   2. presence of any known "useless" phrase (case-insensitive)
 *   3. trimmed length below 100 characters
 *   4. fewer than 50 characters once tags and extra whitespace are removed
 *   5. highly repetitive wording (unique/total ratio below 0.3 over words
 *      longer than three characters, when there are more than ten of them)
 *
 * @param {string} content - Extracted content.
 * @param {string} [source='unknown'] - Label echoed back in the result.
 * @returns {object} `{ isMeaningful, reason, source, ... }` with extra
 *   diagnostic fields depending on `reason`.
 */
function validateMeaningfulContent(content, source = 'unknown') {
  const isBlank = !content || typeof content !== 'string' || content.trim().length === 0;
  if (isBlank) {
    return {
      isMeaningful: false,
      reason: 'empty_content',
      source
    };
  }
  // Phrases that indicate non-meaningful content (error pages, "no results" pages, etc.)
  const junkPhrases = [
    // Google Cache/Search error patterns
    'did not match any documents',
    'no cached version available',
    'accessibility links',
    'google apps',
    'your search -',
    'suggestions:',
    'make sure all words are spelled correctly',
    'footer links',
    // Jina.ai error patterns
    'jina ai reader',
    'failed to extract content',
    'extraction failed',
    'unable to access',
    'error 404',
    'error 403',
    'error 429',
    'error 451',
    'timeouterror',
    'navigation timeout',
    // Generic error patterns
    'page not found',
    'access denied',
    'forbidden',
    'rate limit',
    'service unavailable',
    'connection refused',
    // Cache service error patterns
    'wayback machine',
    'archive.org',
    'this page is not available',
    'cached page',
    'webcache.googleusercontent.com',
    // Minimal content patterns
    'title: cache:',
    'url source:',
    'markdown content:'
  ];
  const haystack = content.toLowerCase();
  const offendingPhrase = junkPhrases.find((phrase) => haystack.includes(phrase));
  if (offendingPhrase !== undefined) {
    return {
      isMeaningful: false,
      reason: 'useless_pattern_detected',
      pattern: offendingPhrase,
      source
    };
  }
  // Extremely short content is most likely an error page
  const trimmedLength = content.trim().length;
  if (trimmedLength < 100) {
    return {
      isMeaningful: false,
      reason: 'content_too_short',
      length: trimmedLength,
      source
    };
  }
  // Strip markup and collapse whitespace to measure the actual text
  const withoutTags = content.replace(/<[^>]*>/g, '');
  const plainText = withoutTags.replace(/\s+/g, ' ').trim();
  if (plainText.length < 50) {
    return {
      isMeaningful: false,
      reason: 'insufficient_text_content',
      textLength: plainText.length,
      source
    };
  }
  // Repetitive wording indicates error pages or broken extraction
  const longWords = plainText.split(' ').filter((word) => word.length > 3);
  const distinctCount = new Set(longWords).size;
  if (longWords.length > 10 && distinctCount / longWords.length < 0.3) {
    return {
      isMeaningful: false,
      reason: 'repetitive_content',
      uniqueWordsRatio: distinctCount / longWords.length,
      source
    };
  }
  return {
    isMeaningful: true,
    reason: 'meaningful_content_detected',
    contentLength: trimmedLength,
    textLength: plainText.length,
    source
  };
}
/**
 * Names the fallback tier that produced a result.
 *
 * The three known services map to fixed tiers; any other service is labelled
 * 'ultra_resilient' when more than four attempts were made, else 'unknown'.
 *
 * @param {string} service - Service identifier from the result.
 * @param {number} totalAttempts - Number of attempts made overall.
 * @returns {string} Tier label.
 */
function determineFallbackLevel(service, totalAttempts) {
  switch (service) {
    case 'tavily':
      return 'primary';
    case 'jinaPublic':
      return 'secondary';
    case 'jinaAPI':
      return 'tertiary';
    default:
      return totalAttempts > 4 ? 'ultra_resilient' : 'unknown';
  }
}
/**
 * Names the extraction strategy for reporting purposes.
 *
 * Enhanced metadata takes precedence over the documentation-site flag.
 *
 * @param {boolean} isDoc - Whether the URL was classified as documentation.
 * @param {boolean} useEnhancedMetadata - Whether enhanced metadata was requested.
 * @returns {string} Strategy label.
 */
function determineStrategy(isDoc, useEnhancedMetadata) {
  return useEnhancedMetadata
    ? 'tavily_first_enhanced_metadata'
    : isDoc
      ? 'tavily_first_optimal_fallback'
      : 'tavily_first_default';
}
/**
 * Checks if an IP address is in a private, loopback, link-local or otherwise
 * non-public range, for use as an SSRF guard.
 *
 * IPv4 ranges rejected: 0.0.0.0/8, 10.0.0.0/8, 100.64.0.0/10, 127.0.0.0/8,
 * 169.254.0.0/16, 172.16.0.0/12, 192.168.0.0/16 and everything from
 * 224.0.0.0 upwards (multicast, reserved, broadcast).
 * IPv6 ranges rejected: unspecified (::), loopback (::1), unique-local
 * (fc00::/7), link-local (fe80::/10), multicast (ff00::/8) and IPv4-mapped /
 * IPv4-compatible addresses whose embedded IPv4 address is rejected.
 *
 * @param {string} ip - The IP address to check.
 * @returns {boolean} - True if the IP is private, false otherwise (including
 *   when `ip` is not a valid IP literal).
 */
function isPrivateIP(ip) {
  // Classifies an IPv4 address given as four numeric octets.
  const isBlockedIPv4 = ([first, second]) => {
    if (first === 0) return true; // "this network"; 0.0.0.0 reaches localhost on many hosts
    if (first === 10) return true; // private
    if (first === 100 && second >= 64 && second <= 127) return true; // carrier-grade NAT
    if (first === 127) return true; // loopback
    if (first === 169 && second === 254) return true; // link-local (cloud metadata services)
    if (first === 172 && second >= 16 && second <= 31) return true; // private
    if (first === 192 && second === 168) return true; // private
    if (first >= 224) return true; // multicast, reserved, broadcast
    return false;
  };
  // Expands a valid IPv6 literal into its eight 16-bit groups.
  const toIPv6Groups = (address) => {
    let text = address.toLowerCase();
    const zoneIndex = text.indexOf('%');
    if (zoneIndex !== -1) text = text.slice(0, zoneIndex); // drop zone id (fe80::1%eth0)
    const lastColon = text.lastIndexOf(':');
    const tail = text.slice(lastColon + 1);
    if (tail.includes('.')) {
      // Rewrite a trailing dotted quad (::ffff:1.2.3.4) as two hex groups.
      const [a, b, c, d] = tail.split('.').map((part) => Number.parseInt(part, 10));
      text = `${text.slice(0, lastColon + 1)}${((a << 8) | b).toString(16)}:${((c << 8) | d).toString(16)}`;
    }
    const [head, rest] = text.split('::');
    const headGroups = head ? head.split(':') : [];
    const restGroups = rest ? rest.split(':') : [];
    // "::" stands for however many zero groups are needed to reach eight.
    const padding = rest === undefined ? [] : new Array(8 - headGroups.length - restGroups.length).fill('0');
    return [...headGroups, ...padding, ...restGroups].map((group) => Number.parseInt(group, 16));
  };

  if (net.isIPv4(ip)) {
    return isBlockedIPv4(ip.split('.').map((part) => Number.parseInt(part, 10)));
  }
  if (net.isIPv6(ip)) {
    const groups = toIPv6Groups(ip);
    const leading = groups[0];
    if ((leading & 0xfe00) === 0xfc00) return true; // fc00::/7 unique-local
    if ((leading & 0xffc0) === 0xfe80) return true; // fe80::/10 link-local
    if ((leading & 0xff00) === 0xff00) return true; // ff00::/8 multicast
    const firstFiveZero = groups.slice(0, 5).every((group) => group === 0);
    if (firstFiveZero && (groups[5] === 0 || groups[5] === 0xffff)) {
      // IPv4-mapped (::ffff:a.b.c.d) or IPv4-compatible (::a.b.c.d). This
      // also covers :: and ::1, whose embedded address falls in 0.0.0.0/8.
      return isBlockedIPv4([groups[6] >> 8, groups[6] & 0xff, groups[7] >> 8, groups[7] & 0xff]);
    }
  }
  return false;
}
/**
 * Validates a URL before extraction, repairing a few common malformations
 * and rejecting targets that could be used for SSRF.
 *
 * Steps: fix doubled protocols and " dot "/whitespace artifacts, parse the
 * URL, allow only http/https, reject localhost and *.local, resolve the
 * hostname and reject private addresses, then reject suspicious/test domains.
 *
 * @param {string} url - URL as supplied by the caller.
 * @returns {Promise<object>} `{ valid, issues, originalURL, normalizedURL }`
 *   plus `error` when invalid, or `hasFixes`/`message` when valid. A URL with
 *   recorded issues is only valid if normalization actually changed it.
 */
async function validateAndNormalizeURL(url) {
  const issues = [];
  let normalizedURL = url;
  // Check for double protocol issues
  if (url.includes('http://https://') || url.includes('https://http://')) {
    issues.push('double_protocol');
    // Fix double protocol
    normalizedURL = url.replace(/https?:\/\/https?:\/\//, 'https://');
  }
  // Check for spaces in URL (common issue from "textise dot iitty")
  if (url.includes(' dot ') || url.includes(' ')) {
    issues.push('spaces_in_domain');
    // Try to fix common patterns
    normalizedURL = normalizedURL.replace(/ dot /g, '.').replace(/\s+/g, '');
  }
  // Check for malformed Jina AI URLs
  if (url.includes('r.jina.ai/http://') && !url.includes('r.jina.ai/http://https://')) {
    issues.push('malformed_jina_url');
    // This is actually the correct pattern for Jina AI
    // NOTE(review): recording it as an issue still makes such a URL fail the
    // "unfixable issues" check further down - confirm that is intended.
  }
  // Basic URL validation and SSRF Protection
  let parsedURL;
  try {
    parsedURL = new URL(normalizedURL);
  } catch (error) {
    issues.push('invalid_url_format');
    return {
      valid: false,
      issues,
      error: `Invalid URL format: ${error.message}`,
      originalURL: url,
      normalizedURL: null
    };
  }
  // SSRF Protection Step 1: Protocol check
  if (parsedURL.protocol !== 'http:' && parsedURL.protocol !== 'https:') {
    issues.push('invalid_protocol');
    return {
      valid: false,
      issues,
      error: `SSRF attack detected: Invalid protocol '${parsedURL.protocol}'. Only HTTP and HTTPS are allowed.`,
      originalURL: url,
      normalizedURL
    };
  }
  const { hostname } = parsedURL;
  // SSRF Protection Step 2: Hostname check
  if (hostname === 'localhost' || hostname.endsWith('.local')) {
    issues.push('forbidden_hostname');
    return {
      valid: false,
      issues,
      error: `SSRF attack detected: Hostname '${hostname}' is forbidden.`,
      originalURL: url,
      normalizedURL
    };
  }
  // SSRF Protection Step 3: Resolve hostname to IP(s) and check each one.
  // A name can resolve to several addresses; checking only the first would
  // let a host with one public and one private record slip through.
  let candidateAddresses;
  if (net.isIP(hostname)) {
    candidateAddresses = [hostname];
  } else {
    try {
      const records = await dns.lookup(hostname, { all: true });
      candidateAddresses = records.map((record) => record.address);
    } catch (error) {
      issues.push('dns_lookup_failed');
      return {
        valid: false,
        issues,
        error: `DNS lookup failed for hostname: ${hostname}. ${error.message}`,
        originalURL: url,
        normalizedURL: null
      };
    }
  }
  const ipAddress = candidateAddresses.find((address) => isPrivateIP(address));
  if (ipAddress !== undefined) {
    issues.push('private_ip_detected');
    return {
      valid: false,
      issues,
      error: `SSRF attack detected: IP address ${ipAddress} is in a forbidden range.`,
      originalURL: url,
      normalizedURL
    };
  }
  // Check for obviously problematic domains that would cause API failures
  const problematicPatterns = [
    /textise dot iitty/i,
    /textise\.iitty/i, // The normalized version is still invalid
    /example dot com/i,
    /example\.com$/i, // Generic example domain
    /test dot /i,
    /\.com\.[a-z]/i, // Likely malformed TLD
    /r\.jina\.ai\/http:\/\/[^/]*\.[a-z]{2,}\/?$/i // Jina AI with obviously fake domain
  ];
  for (const pattern of problematicPatterns) {
    if (pattern.test(normalizedURL)) {
      issues.push('suspicious_domain_pattern');
      break;
    }
  }
  // If we have suspicious patterns that can't be trusted, mark as invalid
  if (issues.includes('suspicious_domain_pattern')) {
    return {
      valid: false,
      issues,
      error: `Unfixable URL issues: suspicious or test domain detected`,
      originalURL: url,
      normalizedURL: null
    };
  }
  // If we have issues but can normalize, return the fixed version
  if (issues.length > 0 && normalizedURL !== url) {
    return {
      valid: true,
      issues,
      originalURL: url,
      normalizedURL,
      hasFixes: true,
      message: `URL normalized: ${issues.join(', ')}`
    };
  }
  // If we have issues that can't be automatically fixed
  if (issues.length > 0) {
    return {
      valid: false,
      issues,
      error: `Unfixable URL issues: ${issues.join(', ')}`,
      originalURL: url,
      normalizedURL: null
    };
  }
  // URL is valid
  return {
    valid: true,
    issues: [],
    originalURL: url,
    normalizedURL: url,
    hasFixes: false
  };
}
/**
 * Probes the three extraction services and reports which are usable.
 *
 * Tavily is checked through validateTavilyAPIKey(); the Jina endpoints are
 * probed with a 5 second request against http://example.com. The Jina API is
 * only probed when an API key is configured. The checks run one after another.
 *
 * @returns {Promise<{tavily: object, jinaPublic: object, jinaAPI: object}>}
 *   Per-service `{ available, error }` entries.
 */
async function performServiceHealthCheck() {
  const report = {
    tavily: { available: false, error: null },
    jinaPublic: { available: false, error: null },
    jinaAPI: { available: false, error: null }
  };

  // Tavily: availability mirrors API key validity
  const keyCheck = await validateTavilyAPIKey();
  report.tavily.available = keyCheck.valid;
  report.tavily.error = keyCheck.reason;

  // Jina public reader
  try {
    const publicProbe = await fetch('https://r.jina.ai/http://example.com', {
      method: 'GET',
      signal: AbortSignal.timeout(5000)
    });
    report.jinaPublic.available = publicProbe.ok;
    if (!publicProbe.ok) {
      report.jinaPublic.error = `HTTP ${publicProbe.status}`;
    }
  } catch (failure) {
    report.jinaPublic.error = failure.message;
  }

  // Jina API reader: skipped without a key
  if (!JINA_API_KEY) {
    report.jinaAPI.available = false;
    report.jinaAPI.error = 'API key not configured';
    return report;
  }
  try {
    const apiProbe = await fetch('https://r.jina.ai/', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${JINA_API_KEY}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ url: 'http://example.com' }),
      signal: AbortSignal.timeout(5000)
    });
    report.jinaAPI.available = apiProbe.ok;
    if (!apiProbe.ok) {
      report.jinaAPI.error = `HTTP ${apiProbe.status}`;
    }
  } catch (failure) {
    report.jinaAPI.error = failure.message;
  }
  return report;
}
/**
 * Enhanced content extraction with optimal service selection strategy.
 *
 * Strategy based on comprehensive research:
 * 1. Always start with Tavily (100% success rate, 863ms fastest) - PRIMARY CHOICE
 * 2. Documentation sites: Tavily First → Jina Public Fallback (better content for docs)
 * 3. Cost tracking: Tavily First → Jina API Fallback (only for token tracking)
 *
 * Pipeline, in order:
 *   a. optional service health check (early exit when every service is down),
 *   b. URL validation / normalization (early exit on an invalid URL),
 *   c. Tavily, then a Jina fallback, then the remaining Jina service,
 *   d. archive-based "ultra-resilient" recovery, gated by the 404 configuration,
 *   e. content validation and assembly of the final report.
 *
 * @param {string} url - URL to extract content from.
 * @param {Object} [options] - Also forwarded unchanged to the individual extractors.
 * @param {boolean} [options.performHealthCheck] - Pass `false` to skip the health check.
 * @param {Object} [options.config404] - Input for `create404Config`; defaults to `{ mode: 'normal' }`.
 * @param {boolean} [options.enhancedMetadata] - Prefer the Jina API as fallback (when a key is
 *   configured) and treat a successful result with empty content as usable.
 * @param {boolean} [options.highVolume] - Same effect as `enhancedMetadata`.
 * @param {boolean} [options.skipEmptyFallback] - Do not fall back merely because Tavily returned empty content.
 * @returns {Promise<Object>} The selected service result, augmented with `success`,
 *   `technicalSuccess`, `meaningfulSuccess`, `contentValidation`, `fallbackLevel`,
 *   `totalAttempts`, `totalResponseTime`, `strategy`, `allResults` and `metadata`.
 *   When nothing usable was extracted, `error.code` is `'ALL_SERVICES_FAILED'`
 *   (or `'ALL_SERVICES_DOWN'` / `'INVALID_URL'` for the two early exits).
 */
export async function extractContent(url, options = {}) {
  const startTime = Date.now();
  const results = [];

  // Perform service health check at the start
  if (options.performHealthCheck !== false) {
    log(`🔍 Performing service health check...`);
    const healthStatus = await performServiceHealthCheck();
    log(`📊 Service Health Status:`);
    log(` Tavily: ${healthStatus.tavily.available ? '✅ Available' : '❌ Unavailable - ' + healthStatus.tavily.error}`);
    log(` Jina Public: ${healthStatus.jinaPublic.available ? '✅ Available' : '❌ Unavailable - ' + healthStatus.jinaPublic.error}`);
    log(` Jina API: ${healthStatus.jinaAPI.available ? '✅ Available' : '❌ Unavailable - ' + healthStatus.jinaAPI.error}`);

    // If no services are available, fail early
    if (!healthStatus.tavily.available && !healthStatus.jinaPublic.available && !healthStatus.jinaAPI.available) {
      return {
        success: false,
        error: { code: 'ALL_SERVICES_DOWN', message: 'All extraction services are unavailable' },
        content: '',
        contentLength: 0,
        service: 'none',
        url,
        responseTime: Date.now() - startTime,
        totalAttempts: 0,
        totalResponseTime: Date.now() - startTime,
        healthStatus,
        metadata: {
          extractionStrategy: 'all_services_failed',
          timestamp: new Date().toISOString()
        }
      };
    }
  }

  // Initialize 404 configuration for smart handling
  const config404 = create404Config(options.config404 || { mode: 'normal' });
  log(`🎯 404 Handling: ${config404.description}`);

  // Pre-validate and normalize URL before extraction
  const urlValidation = await validateAndNormalizeURL(url);
  let extractionURL = url;
  if (!urlValidation.valid) {
    log(`❌ URL validation failed: ${urlValidation.error}`);
    return {
      success: false,
      error: {
        code: 'INVALID_URL',
        message: urlValidation.error,
        issues: urlValidation.issues
      },
      content: '',
      contentLength: 0,
      service: 'validation',
      url,
      responseTime: Date.now() - startTime,
      totalAttempts: 0,
      totalResponseTime: Date.now() - startTime,
      metadata: {
        extractionStrategy: 'url_validation_failed',
        timestamp: new Date().toISOString(),
        originalURL: urlValidation.originalURL,
        validationIssues: urlValidation.issues
      }
    };
  }
  if (urlValidation.hasFixes) {
    log(`🔧 URL normalized: ${urlValidation.message}`);
    log(` Original: ${urlValidation.originalURL}`);
    log(` Normalized: ${urlValidation.normalizedURL}`);
    extractionURL = urlValidation.normalizedURL;
  }

  // Determine optimal strategy based on URL characteristics
  const isDoc = isDocumentationSite(extractionURL);
  const isProblematic = isProblematicDomain(extractionURL);
  const useEnhancedMetadata = options.enhancedMetadata || options.highVolume;

  // A result counts as usable when the service reported success and either
  // returned content or the caller only wants metadata.
  const isUsable = (r) => r.success && (r.contentLength > 0 || useEnhancedMetadata);

  // Uniform failure record for a service call that threw instead of returning.
  const exceptionResult = (service, error) => ({
    success: false,
    error: { code: 'EXCEPTION', message: error.message },
    service,
    url: extractionURL,
    originalURL: url,
    responseTime: Date.now() - startTime,
    content: '',
    contentLength: 0
  });

  log(`🎯 Extracting content from: ${extractionURL}`);
  if (extractionURL !== url) {
    log(` (Original URL: ${url})`);
  }
  log(` URL Type: ${isDoc ? 'Documentation site' : isProblematic ? 'Problematic domain' : 'General URL'}`);
  log(` Enhanced Metadata: ${useEnhancedMetadata ? 'enabled' : 'disabled'}`);

  let result;

  // Strategy 1: Always start with Tavily (research shows it's fastest and most reliable)
  log(`🚀 Using Tavily first...`);
  try {
    result = await extractWithTavily(extractionURL, options);
    results.push(result);
  } catch (error) {
    result = exceptionResult('tavily', error);
    results.push(result);
    log(`❌ Tavily extraction failed with exception: ${error.message}`);
  }

  // Determine fallback service based on specific needs and service availability
  let fallbackService = 'jinaPublic'; // Default fallback
  let fallbackReason = 'default';
  if (useEnhancedMetadata && JINA_API_KEY) {
    fallbackService = 'jinaAPI';
    fallbackReason = 'enhanced metadata requested';
  } else if (isDoc) {
    fallbackService = 'jinaPublic';
    fallbackReason = 'documentation site';
  }

  // Enhanced fallback logic with better error detection
  const needsFallback = !result.success ||
    result.error?.code === '401' || // Invalid API key
    result.error?.code === '403' || // Forbidden
    result.error?.code === '429' || // Rate limited
    result.error?.code === 'EXCEPTION' || // Exception occurred
    (result.contentLength === 0 && !options.skipEmptyFallback) ||
    (useEnhancedMetadata && !result.success);

  if (needsFallback) {
    log(`⚠️ Tavily failed or returned empty, trying ${fallbackService} (${fallbackReason})...`);
    log(` Failure reason: ${result.error?.code || result.error?.message || 'Empty content'}`);
    let fallbackResult;
    try {
      if (fallbackService === 'jinaAPI' && JINA_API_KEY) {
        fallbackResult = await extractWithJinaAPI(extractionURL, options);
      } else {
        fallbackResult = await extractWithJinaPublic(extractionURL, options);
      }
      results.push(fallbackResult);

      // Use fallback if it succeeded
      if (isUsable(fallbackResult)) {
        result = fallbackResult;
        log(`✅ Fallback to ${fallbackService} successful`);
        // Smart 404 detection for logging and metrics
        const detection404 = detect404Error(result.content);
        if (detection404.detected) {
          log(`🔍 404 patterns detected: ${detection404.patterns.join(', ')}`);
          result.fallback404Detection = detection404;
        }
      } else {
        log(`❌ Fallback to ${fallbackService} failed: ${fallbackResult.error?.message || 'Empty content'}`);
      }
    } catch (error) {
      log(`❌ Fallback to ${fallbackService} failed with exception: ${error.message}`);
      fallbackResult = exceptionResult(fallbackService, error);
      results.push(fallbackResult);
    }
  }

  // Final fallback if needed (try the remaining service)
  if ((!result.success || result.contentLength === 0) && !useEnhancedMetadata && JINA_API_KEY) {
    const finalService = fallbackService === 'jinaPublic' ? 'jinaAPI' : 'jinaPublic';
    log(`🔄 Final fallback to ${finalService}...`);
    try {
      const finalFallback = finalService === 'jinaAPI' ?
        await extractWithJinaAPI(extractionURL, options) :
        await extractWithJinaPublic(extractionURL, options);
      results.push(finalFallback);
      if (finalFallback.success && finalFallback.contentLength > 0) {
        result = finalFallback;
        log(`✅ Final fallback to ${finalService} successful`);
        // Smart 404 detection for logging and metrics
        const detection404 = detect404Error(result.content);
        if (detection404.detected) {
          log(`🔍 404 patterns detected: ${detection404.patterns.join(', ')}`);
          result.finalFallback404Detection = detection404;
        }
      } else {
        log(`❌ Final fallback to ${finalService} failed`);
      }
    } catch (error) {
      log(`❌ Final fallback to ${finalService} failed with exception: ${error.message}`);
      results.push(exceptionResult(finalService, error));
    }
  }

  // Ultra-resilient fallback: Try pattern-based alternative approaches if all standard services failed
  // Use smart 404 configuration to decide whether to attempt recovery
  // Records whether archive recovery actually ran, so the metrics below report
  // what happened rather than re-evaluating the decision after the fact.
  let archivesAttempted = false;
  if (!result.success || result.contentLength === 0) {
    // Get the best 404 detection result we have
    let detection404 = result.fallback404Detection || result.finalFallback404Detection || { detected: false };
    // If no content-based detection worked, try URL-based detection
    if (!detection404.detected) {
      detection404 = detect404FromURL(extractionURL);
    }

    // Determine if we should try archive recovery
    const shouldTry = shouldTryArchives(extractionURL, detection404, config404);
    if (shouldTry) {
      archivesAttempted = true;
      log(`🚨 Trying ultra-resilient fallbacks with 404 configuration...`);
      log(` 404 detected: ${detection404.detected} (source: ${detection404.source || 'content'}), Archive probability: ${config404.archiveProbability}`);
      // Pass 404 config to the ultra-resilient fallback system
      const ultraResilientOptions = {
        ...options,
        config404,
        maxArchiveAttempts: config404.maxArchiveAttempts
      };
      const ultraResilientResult = await tryUltraResilientFallbacks(extractionURL, ultraResilientOptions, results);
      if (ultraResilientResult.success) {
        result = ultraResilientResult.result;
        results.push(ultraResilientResult.result);
        log(`✅ Ultra-resilient fallback successful via ${ultraResilientResult.result.service}`);
      } else {
        log(`❌ Ultra-resilient fallbacks also failed`);
      }
    } else {
      if (detection404.detected) {
        log(`⏭️ Skipping ultra-resilient fallbacks (404 detected, configuration: ${config404.mode})`);
      } else {
        log(`⏭️ Skipping ultra-resilient fallbacks (disabled by configuration)`);
      }
    }
  }

  const totalTime = Date.now() - startTime;

  // Return the successful result or the last attempted result.
  // Prefer the result the fallback chain selected: with enhanced metadata an
  // earlier empty-but-successful attempt is also "usable" and must not shadow a
  // later attempt that actually returned content. Scanning `results` remains as
  // a safety net for usable entries the chain did not promote.
  const successfulResult = isUsable(result) ? result : (results.find(isUsable) ?? result);
  // Only consider the extraction successful if at least one service actually worked
  const hasAnySuccessfulService = Boolean(isUsable(successfulResult));

  // Validate if the content is actually meaningful
  const contentValidation = validateMeaningfulContent(successfulResult.content, successfulResult.service);
  // Detect 404 patterns for metrics and intelligent handling
  const detection404 = detect404Error(successfulResult.content);

  // Determine honest success metrics
  const technicalSuccess = successfulResult.success && successfulResult.contentLength > 0;
  const meaningfulSuccess = technicalSuccess && contentValidation.isMeaningful;
  const fallbackLevel = determineFallbackLevel(successfulResult.service, results.length);

  // Log content validation results for debugging
  if (technicalSuccess && !meaningfulSuccess) {
    log(`⚠️ Technical success but content validation failed:`);
    log(` Reason: ${contentValidation.reason}${contentValidation.pattern ? ` (${contentValidation.pattern})` : ''}`);
    log(` Source: ${contentValidation.source}`);
  } else if (meaningfulSuccess) {
    log(`✅ Meaningful content extracted successfully (${contentValidation.contentLength} chars)`);
  }

  const finalResult = {
    ...successfulResult,
    // Legacy success field (for backwards compatibility)
    success: hasAnySuccessfulService,
    // Enhanced success reporting
    technicalSuccess,
    meaningfulSuccess,
    contentValidation,
    fallbackLevel,
    totalAttempts: results.length,
    totalResponseTime: totalTime,
    strategy: {
      isDocumentationSite: isDoc,
      isProblematicDomain: isProblematic,
      enhancedMetadataEnabled: useEnhancedMetadata,
      primaryService: 'tavily', // ALWAYS Tavily first
      fallbackService,
      fallbackReason
    },
    allResults: results,
    metadata: {
      ...successfulResult.metadata,
      extractionStrategy: 'tavily_first_optimal_fallback',
      timestamp: new Date().toISOString(),
      totalTokensUsed: results.reduce((sum, r) => sum + (r.metadata?.tokenUsage || 0), 0),
      urlValidation: {
        originalURL: url,
        normalizedURL: extractionURL,
        wasNormalized: urlValidation.hasFixes,
        validationIssues: urlValidation.issues,
        validationMessage: urlValidation.message
      },
      allServicesFailed: !hasAnySuccessfulService,
      ultraResilientAttempts: results.length > 3 ? results.length - 3 : 0,
      attemptedServices: results.map(r => r.service),
      successfulService: hasAnySuccessfulService ? successfulResult.service : null,
      // New meaningful content metrics
      honestSuccessMetrics: {
        technicalSuccess,
        meaningfulSuccess,
        fallbackLevel,
        contentQuality: contentValidation.isMeaningful ? 'meaningful' : 'useless',
        contentIssues: contentValidation.isMeaningful ? null : {
          reason: contentValidation.reason,
          pattern: contentValidation.pattern,
          source: contentValidation.source
        },
        // 404 handling metrics
        handling404: {
          detected404: detection404?.detected || false,
          fourOFourPatterns: (detection404 && detection404.patterns) ? detection404.patterns : [],
          fourOFourConfidence: detection404?.confidence || 0,
          attemptedArchives: archivesAttempted,
          archiveMode: config404.mode,
          archiveProbability: config404.archiveProbability,
          maxArchiveAttempts: config404.maxArchiveAttempts,
          isHighValueDomain: isHighValueDomain(extractionURL, config404),
          isLowValueContent: isLowValueContent(extractionURL, config404)
        }
      }
    }
  };

  // If all services failed, add appropriate error information
  if (!hasAnySuccessfulService) {
    finalResult.error = {
      code: 'ALL_SERVICES_FAILED',
      message: 'All extraction services failed to retrieve content',
      attempts: results.length,
      serviceResults: results.map(r => ({ service: r.service, success: r.success, error: r.error?.code })),
      ultraResilientAttempts: results.length > 3 ? results.length - 3 : 0
    };
  }

  return finalResult;
}
/**
 * Minimal client for the Tavily search API.
 *
 * `tavily.search(params, timeoutMs)` POSTs the query to
 * https://api.tavily.com/search and resolves with the parsed JSON body.
 *
 * @param {Object} params - Search parameters
 * @param {string} params.query - Search query text
 * @param {number} [params.maxResults] - Sent as `max_results`; defaults to 5
 * @param {boolean} [params.includeAnswer] - Sent as `include_answer`; true unless explicitly `false`
 * @param {boolean} [params.includeRawContent] - Sent as `include_raw_content`; defaults to false
 * @param {number} [params.numDays] - Sent as `num_days`; defaults to 30
 * @param {Object} [params.headers] - Extra request headers merged over the JSON content type
 * @param {number} timeoutMs - Request timeout in milliseconds (covers the body read too)
 * @returns {Object} Search results as returned by the API
 * @throws {Error} When the API key is missing, the request times out, the
 *   connection is refused, or the API answers with a non-2xx status
 */
export const tavily = {
  search: async function tavilySearch(params, timeoutMs = 15000) {
    if (!TAVILY_API_KEY) {
      throw new Error('Tavily API key not configured');
    }

    // Construct the request payload
    const requestBody = {
      api_key: TAVILY_API_KEY,
      query: params.query,
      max_results: params.maxResults || 5,
      include_answer: params.includeAnswer !== false, // Default to true
      include_raw_content: params.includeRawContent || false,
      num_days: params.numDays || 30, // Look back 30 days by default
    };

    // Add headers if provided
    const headers = {
      'Content-Type': 'application/json',
      ...params.headers
    };

    try {
      // AbortSignal.timeout owns its timer, so nothing has to be cleared by hand.
      // The previous implementation handed a timers/promises Promise to
      // clearTimeout(), which is a no-op: the timer was never cancelled and kept
      // the event loop alive for the full timeout after every call.
      const response = await fetch('https://api.tavily.com/search', {
        method: 'POST',
        headers,
        body: JSON.stringify(requestBody),
        signal: AbortSignal.timeout(timeoutMs)
      });

      if (!response.ok) {
        const errorData = await response.json().catch(() => ({}));
        throw new Error(`Tavily API error: ${response.status} - ${errorData.error || response.statusText}`);
      }

      return await response.json();
    } catch (error) {
      // AbortSignal.timeout rejects with a 'TimeoutError'; 'AbortError' is kept
      // for aborts raised through a plain AbortController.
      if (error.name === 'AbortError' || error.name === 'TimeoutError') {
        throw new Error(`Request timeout after ${timeoutMs}ms`, { cause: error });
      }
      // Native fetch wraps socket failures in a TypeError whose `cause` carries
      // the system error code, so look in both places.
      const errorCode = error.code ?? error.cause?.code;
      if (errorCode === 'ECONNREFUSED') {
        throw new Error(`Connection refused when trying to reach Tavily API: ${error.message}`, { cause: error });
      }
      throw error;
    }
  }
};
/**
 * Writes a message to stdout, tagged with the module name.
 * Kept deliberately tiny so it can be swapped for a real logger later.
 */
function log(message) {
  const taggedLine = `[ContentExtractor] ${message}`;
  console.log(taggedLine);
}
/**
 * Batch content extraction for multiple URLs.
 *
 * URLs are processed in consecutive groups of `concurrency`; every URL in a
 * group is extracted in parallel via `extractContent`, and a pause is inserted
 * between groups to be respectful to rate limits.
 *
 * @param {string[]} urls - URLs to extract.
 * @param {Object} [options] - Forwarded unchanged to `extractContent`.
 * @param {number} [options.concurrency=3] - URLs processed in parallel per group.
 *   Values that are not a number >= 1 fall back to 3.
 * @param {number} [options.batchDelayMs=1000] - Pause between groups, in milliseconds.
 * @param {boolean} [options.enhancedMetadata] - When set, successful results with
 *   empty content are counted as successful in the summary.
 * @returns {Promise<{results: Object[], summary: {total: number, successful: number, failed: number, successRate: number}}>}
 *   One entry per input URL (in input order) plus aggregate counts.
 */
export async function extractContentBatch(urls, options = {}) {
  const results = [];

  // A non-positive or non-numeric step would stall or corrupt the loop index.
  const requestedConcurrency = Math.floor(Number(options.concurrency));
  const concurrency = requestedConcurrency >= 1 ? requestedConcurrency : 3;
  const batchDelayMs = options.batchDelayMs ?? 1000;

  log(`📦 Batch extracting ${urls.length} URLs with concurrency ${concurrency}`);

  for (let i = 0; i < urls.length; i += concurrency) {
    const batch = urls.slice(i, i + concurrency);
    const batchResults = await Promise.allSettled(batch.map((url) => extractContent(url, options)));

    batchResults.forEach((result, index) => {
      const url = batch[index];
      if (result.status === 'fulfilled') {
        results.push({ url, ...result.value });
      } else {
        results.push({
          url,
          success: false,
          // The rejection reason is not guaranteed to be an Error instance.
          error: { code: 'BATCH_ERROR', message: result.reason?.message ?? String(result.reason) },
          content: '',
          contentLength: 0,
          service: 'batch_failed'
        });
      }
    });

    // Small delay between batches to be respectful to rate limits.
    // `setTimeout` in this module is the promise-based import from
    // 'timers/promises', so it is awaited directly; wrapping it callback-style
    // in `new Promise` never resolves.
    if (batchDelayMs > 0 && i + concurrency < urls.length) {
      await setTimeout(batchDelayMs);
    }
  }

  const successCount = results.filter(r => r.success && (r.contentLength > 0 || options.enhancedMetadata)).length;
  log(`✅ Batch extraction complete: ${successCount}/${urls.length} successful`);

  return {
    results,
    summary: {
      total: urls.length,
      successful: successCount,
      failed: urls.length - successCount,
      // Avoid NaN (0 / 0) for an empty input list.
      successRate: urls.length === 0 ? 0 : Math.round((successCount / urls.length) * 100)
    }
  };
}
// Default export: the module's public surface bundled into a single object,
// for consumers that prefer `import extractor from './content-extractor.mjs'`.
const contentExtractorModule = {
  extractContent,
  extractContentBatch,
  tavily,
  SERVICES,
  isDocumentationSite,
  isProblematicDomain
};

export default contentExtractorModule;