Initial commit
This commit is contained in:
1706
hooks/content-extractor.mjs
Normal file
1706
hooks/content-extractor.mjs
Normal file
File diff suppressed because it is too large
Load Diff
117
hooks/handle-rate-limit.mjs
Normal file
117
hooks/handle-rate-limit.mjs
Normal file
@@ -0,0 +1,117 @@
|
||||
// hooks/handle-rate-limit.mjs
|
||||
import contentExtractor from './content-extractor.mjs';
|
||||
|
||||
/**
 * Recovers from a rate-limited search by backing off and retrying with a
 * lighter request.
 *
 * Attempt 1 waits for the server's Retry-After hint (60s when absent or
 * unusable) plus up to 5s of random jitter, then retries with rate-limit
 * headers and half the requested result count.
 * Attempt 2 waits a further 2 minutes and retries with conservative headers,
 * a single result and a simplified query.
 * Failures are reported through the returned object rather than thrown.
 *
 * @param {Object} error - The rate limit error; `error.response.headers['retry-after']` is honoured when present
 * @param {Object} options - Search options
 * @returns {Object} `{ success: true, data, message }` on recovery, otherwise `{ error: true, message }`
 */
export async function handleRateLimit(error, options) {
  console.log('Handling rate limit error...');

  const DEFAULT_DELAY_MS = 60000; // 1 minute when the server gives no usable hint
  const MAX_JITTER_MS = 5000;
  const SECOND_ATTEMPT_DELAY_MS = 120000; // 2 minutes

  // Optional chaining: the error may be a bare object without a response
  const retryAfterMs = parseRetryAfterMs(error?.response?.headers?.['retry-after']);

  // Apply jitter to avoid thundering herd problem
  const delay = (retryAfterMs ?? DEFAULT_DELAY_MS) + Math.random() * MAX_JITTER_MS;

  console.log(`Waiting ${Math.round(delay/1000)} seconds before retrying due to rate limiting...`);

  try {
    // Wait for the required time
    await new Promise(resolve => setTimeout(resolve, delay));

    // Try again with modified parameters to reduce load
    const modifiedParams = {
      ...options,
      headers: generateRateLimitHeaders(),
      maxResults: Math.max(1, Math.floor((options.maxResults || 5) / 2)) // Reduce number of results
    };

    const results = await contentExtractor.tavily.search(modifiedParams);
    return {
      success: true,
      data: results,
      message: 'Successfully retrieved results after handling rate limit'
    };
  } catch (retryError) {
    // Surface why the first retry failed instead of dropping it silently
    console.log(`Rate limit retry failed: ${retryError?.message}`);

    // If still rate limited, try with even more conservative parameters
    try {
      // Wait an additional time
      await new Promise(resolve => setTimeout(resolve, SECOND_ATTEMPT_DELAY_MS));

      const conservativeParams = {
        ...options,
        headers: generateVeryConservativeHeaders(),
        maxResults: 1, // Get just one result
        query: simplifyQuery(options.query)
      };

      const results = await contentExtractor.tavily.search(conservativeParams);
      return {
        success: true,
        data: results,
        message: 'Successfully retrieved results with conservative approach after rate limiting'
      };
    } catch (finalError) {
      return {
        error: true,
        message: `Rate limit handling failed after multiple attempts: ${finalError?.message}`
      };
    }
  }
}

/**
 * Converts a Retry-After header value into a delay in milliseconds.
 * Accepts both forms the header may take: a non-negative number of seconds,
 * or an HTTP-date (converted to the time remaining from now, never negative).
 *
 * @param {*} headerValue - Raw header value, possibly undefined
 * @returns {number|null} Delay in milliseconds, or null when the value is absent or unusable
 */
function parseRetryAfterMs(headerValue) {
  if (headerValue == null) {
    return null;
  }

  const text = String(headerValue).trim();

  // delay-seconds form
  if (/^\d+(\.\d+)?$/.test(text)) {
    return Number(text) * 1000;
  }

  // HTTP-date form; require a letter so stray numeric junk is not read as a date
  if (/[a-z]/i.test(text)) {
    const retryAt = Date.parse(text);
    if (!Number.isNaN(retryAt)) {
      return Math.max(0, retryAt - Date.now());
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Builds the fixed header set sent with the first retry after a rate limit.
 * NOTE(review): the User-Agent presents the client as Googlebot - confirm this
 * is acceptable for the services being queried.
 * @returns {Object} Header name/value map
 */
function generateRateLimitHeaders() {
  const headerPairs = [
    ['User-Agent', 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'],
    ['Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'],
    ['Accept-Language', 'en-US,en;q=0.5'],
    ['Accept-Encoding', 'gzip, deflate'],
    ['Connection', 'keep-alive'],
    ['Upgrade-Insecure-Requests', '1'],
    ['DNT', '1']
  ];

  return Object.fromEntries(headerPairs);
}
|
||||
|
||||
/**
 * Builds the fixed, minimal header set used for the last-resort retry.
 * @returns {Object} Header name/value map
 */
function generateVeryConservativeHeaders() {
  const headers = {};

  headers['User-Agent'] = 'Mozilla/5.0 (compatible; ArchiveBot/1.0; +http://archive.org/details/archivebot)';
  headers['Accept'] = 'text/html';
  headers['Accept-Language'] = 'en-US,en;q=0.9';
  // Uncompressed responses only, to reduce processing load
  headers['Accept-Encoding'] = 'identity';
  headers['Connection'] = 'close';
  headers['Cache-Control'] = 'max-age=0';

  return headers;
}
|
||||
|
||||
/**
 * Simplifies a query by dropping instructional phrases ("how to",
 * "guide to", "tutorial for") and intensity adjectives ("detailed",
 * "comprehensive", "complete"), matched case-insensitively on word boundaries.
 * Whitespace left behind by the removals is collapsed to single spaces.
 *
 * @param {string} query - Original query
 * @returns {string} Simplified query; an empty string when `query` is not a string
 */
function simplifyQuery(query) {
  // A missing query should not crash the recovery path
  if (typeof query !== 'string') {
    return '';
  }

  return query
    .replace(/\b(how to|guide to|tutorial for)\b/gi, '')
    .replace(/\b(detailed|comprehensive|complete)\b/gi, '')
    // Removing a phrase from the middle leaves doubled spaces; normalise them
    .replace(/\s+/g, ' ')
    .trim();
}
|
||||
887
hooks/handle-search-error.mjs
Normal file
887
hooks/handle-search-error.mjs
Normal file
@@ -0,0 +1,887 @@
|
||||
// hooks/handle-search-error.mjs
|
||||
import contentExtractor from './content-extractor.mjs';
|
||||
import { handleRateLimit } from './handle-rate-limit.mjs';
|
||||
|
||||
// ============================================================================
// CONFIGURATION: Recovery strategy timeout
// ============================================================================

/**
 * Upper bound, in milliseconds, that a single recovery strategy may take.
 * Read from the SEARCH_PLUS_RECOVERY_TIMEOUT_MS environment variable and
 * passed through validateRecoveryTimeout; an unset or empty variable is
 * treated as '5000' (5 seconds).
 */
const { SEARCH_PLUS_RECOVERY_TIMEOUT_MS: rawRecoveryTimeout } = process.env;
const RECOVERY_TIMEOUT_MS = validateRecoveryTimeout(rawRecoveryTimeout || '5000');
|
||||
|
||||
/**
 * Parses a recovery-timeout setting and forces it into the 100ms-60000ms range.
 * A value that does not start with an integer falls back to 5000ms; values
 * outside the range are clamped to the nearest bound. Each correction is
 * reported with console.warn.
 * @param {string} value - Raw timeout setting
 * @returns {number} Timeout in milliseconds
 */
function validateRecoveryTimeout(value) {
  const timeoutMs = Number.parseInt(value, 10);

  // Not a number at all: use the default
  if (Number.isNaN(timeoutMs)) {
    console.warn(`⚠️ Invalid SEARCH_PLUS_RECOVERY_TIMEOUT_MS: "${value}". Using default 5000ms.`);
    return 5000;
  }

  const belowRange = timeoutMs < 100;
  const aboveRange = timeoutMs > 60000;

  // Inside the accepted window (100ms to 60s)
  if (!belowRange && !aboveRange) {
    return timeoutMs;
  }

  console.warn(
    belowRange
      ? `⚠️ SEARCH_PLUS_RECOVERY_TIMEOUT_MS too low: ${timeoutMs}ms. Minimum is 100ms. Using 100ms.`
      : `⚠️ SEARCH_PLUS_RECOVERY_TIMEOUT_MS too high: ${timeoutMs}ms. Maximum is 60000ms. Using 60000ms.`
  );

  return belowRange ? 100 : 60000;
}
|
||||
|
||||
// Print the effective recovery timeout, but only when NODE_ENV is 'development'
if (process.env.NODE_ENV === 'development') {
  const effectiveTimeout = `${RECOVERY_TIMEOUT_MS}ms`;
  console.log(`🔧 Search-Plus Recovery Timeout: ${effectiveTimeout}`);
}
|
||||
|
||||
/**
 * Builds the common failure envelope returned by recovery strategies.
 * Keys of `additionalInfo` are merged last, so they override the base fields.
 * @param {string} strategy - Name of the strategy that failed
 * @param {Error|string} error - The failure; an Error is reduced to its message
 * @param {number} startTime - Strategy start timestamp (ms since epoch)
 * @param {Object} additionalInfo - Additional context info
 * @returns {Object} `{ success: false, error, strategy, responseTime, timestamp, ...additionalInfo }`
 */
function createStandardErrorResponse(strategy, error, startTime, additionalInfo = {}) {
  const elapsedMs = Date.now() - startTime;

  let failureReason = error;
  if (error instanceof Error) {
    failureReason = error.message;
  }

  const envelope = {
    success: false,
    error: failureReason,
    strategy,
    responseTime: elapsedMs,
    timestamp: new Date().toISOString()
  };

  return { ...envelope, ...additionalInfo };
}
|
||||
|
||||
/**
 * Builds the common success envelope returned by recovery strategies.
 * Keys of `additionalInfo` are merged last, so they override the base fields.
 * @param {string} strategy - Name of the strategy that succeeded
 * @param {*} data - The data returned by the strategy
 * @param {number} startTime - Strategy start timestamp (ms since epoch)
 * @param {Object} additionalInfo - Additional context info
 * @returns {Object} `{ success: true, data, strategy, responseTime, timestamp, ...additionalInfo }`
 */
function createStandardSuccessResponse(strategy, data, startTime, additionalInfo = {}) {
  const elapsedMs = Date.now() - startTime;

  const envelope = {
    success: true,
    data,
    strategy,
    responseTime: elapsedMs,
    timestamp: new Date().toISOString()
  };

  return { ...envelope, ...additionalInfo };
}
|
||||
|
||||
/**
 * Dispatches a failed web search to the matching recovery handler.
 *
 * The error is classified by `error.code` and by substrings of its message,
 * checked in this order: 403/forbidden, 451/security block, 422/schema,
 * 429/rate limit, connection refused, timeout. Anything else is returned as
 * a plain failure object.
 *
 * @param {Object} error - The error object (`code` and/or `message`)
 * @param {Object} options - Search options that caused the error
 * @returns {Object} Recovery results or final error
 */
export async function handleWebSearchError(error, options) {
  console.log('Handling search error:', error);

  // Nothing to classify: report a failure instead of crashing on property access
  if (error == null) {
    return {
      error: true,
      message: `Search failed: ${error}`,
      code: undefined
    };
  }

  const code = error.code;

  // Errors that carry only a code (no string message) must still be classifiable
  const message = typeof error.message === 'string' ? error.message : String(error.message ?? '');
  const loweredMessage = message.toLowerCase();
  const mentions = (...needles) => needles.some(needle => loweredMessage.includes(needle));

  // Check error type and apply appropriate recovery strategy
  if (code === 403 || mentions('403', 'forbidden')) {
    return await handle403Error(error, options);
  }

  if (code === 451 || mentions('451', 'securitycompromise', 'blocked until')) {
    return await handle451SecurityError(error, options);
  }

  if (code === 422 || mentions('422') || is422SchemaError(error)) {
    return await handle422Error(error, options);
  }

  if (code === 429 || mentions('429', 'rate limit')) {
    return await handleRateLimit(error, options);
  }

  if (code === 'ECONNREFUSED' || mentions('connection refused')) {
    return await handleConnectionRefusedError(error, options);
  }

  if (code === 'ETIMEDOUT' || mentions('timeout')) {
    return await handleTimeoutError(error, options);
  }

  // For other errors, return the original error
  return {
    error: true,
    message: `Search failed: ${error.message}`,
    code: error.code
  };
}
|
||||
|
||||
/**
 * Recovers from a 403 Forbidden response in two steps: first a retry with a
 * fresh header set after a 2 second pause, then - if that throws - a retry
 * with a reformulated query.
 * @param {Object} error - The 403 error
 * @param {Object} options - Search options
 * @returns {Object} `{ success: true, data, message }` or `{ error: true, message }`
 */
async function handle403Error(error, options) {
  console.log('Handling 403 error - trying with different headers...');

  // Step 1: same query, completely different headers
  try {
    const retryParams = { ...options, headers: generateDiverseHeaders() };

    // Give the remote side a moment before retrying
    await new Promise((resolve) => {
      setTimeout(resolve, 2000);
    });

    const data = await contentExtractor.tavily.search(retryParams);

    return {
      success: true,
      data,
      message: 'Successfully retrieved results after handling 403 error'
    };
  } catch (retryError) {
    console.log('403 retry failed, trying alternative approach...');
  }

  // Step 2: different formulation of the search query
  try {
    const rewordedQuery = reformulateQuery(options.query);
    const data = await contentExtractor.tavily.search({ ...options, query: rewordedQuery });

    return {
      success: true,
      data,
      message: 'Successfully retrieved results with reformulated query after 403 error'
    };
  } catch (finalError) {
    return {
      error: true,
      message: `Failed to retrieve results after handling 403 error: ${finalError.message}`
    };
  }
}
|
||||
|
||||
/**
 * Handles 451 SecurityCompromiseError (domain blocked due to abuse).
 *
 * Runs the domain-exclusion and alternative-sources strategies in parallel
 * and returns the first one that actually succeeds. When
 * SEARCH_PLUS_451_SIMPLE_MODE is 'true' the work is delegated to
 * handleSimple451Recovery, which produces minimal output.
 *
 * @param {Object} error - The 451 error
 * @param {Object} options - Search options
 * @returns {Object} Recovery results, or an enhanced error response with suggestions
 */
async function handle451SecurityError(error, options) {
  // The error may carry only a code; do not crash on a missing message
  const blockedDomain = typeof error?.message === 'string'
    ? extractBlockedDomain(error.message)
    : null;

  // Simple mode for power users who want minimal output
  if (process.env.SEARCH_PLUS_451_SIMPLE_MODE === 'true') {
    return await handleSimple451Recovery(error, options, blockedDomain);
  }

  // Enhanced UX logging by default
  console.log('🚫 451 SecurityCompromiseError detected');
  console.log(`📍 Blocked domain: ${blockedDomain || 'unknown'}`);
  console.log('🚀 Starting parallel recovery:');
  console.log(' 🛡️ Strategy 1: Domain exclusion');
  console.log(' 🔍 Strategy 2: Alternative sources');

  // The strategies report failure by resolving with { success: false } rather
  // than rejecting. Promise.any only skips rejections, so convert failed
  // outcomes into rejections; otherwise the first failure would be reported
  // as a success.
  const requireSuccess = async (pending) => {
    const outcome = await pending;
    if (!outcome?.success) {
      throw new Error(typeof outcome?.error === 'string' ? outcome.error : 'Recovery strategy failed');
    }
    return outcome;
  };

  // Optimized parallel execution using the two most effective strategies
  const strategies = [
    requireSuccess(searchWithExcludedDomainUnified(options, blockedDomain, true)),
    requireSuccess(tryAlternativeSearchSources(options, true))
  ];

  try {
    const results = await Promise.any(strategies);
    console.log(`✅ Success! Used strategy: ${results.strategy} (${results.responseTime}ms)`);

    // Provide actionable suggestions for future searches
    if (blockedDomain) {
      console.log(`💡 Next time, try: /search-plus "${options.query} -site:${blockedDomain}"`);
    }

    return {
      success: true,
      data: results.data,
      message: `Successfully retrieved results using ${results.strategy} for blocked domain ${blockedDomain || 'unknown'}`,
      strategy: results.strategy,
      responseTime: results.responseTime,
      blockedDomain: blockedDomain
    };
  } catch (aggregateError) {
    // Enhanced error classification and user guidance
    const failureType = classify451Failure(aggregateError, blockedDomain, options);
    console.log(`❌ All recovery strategies failed`);
    console.log(`🔍 Error type: ${failureType.type}`);

    if (failureType.suggestions.length > 0) {
      console.log('💡 Suggestions:');
      failureType.suggestions.forEach((suggestion, i) => {
        console.log(` ${i + 1}. ${suggestion.description}`);
      });
    }

    return generateEnhancedErrorResponse(failureType, blockedDomain, options);
  }
}
|
||||
|
||||
/**
 * Handles 451 errors in simple mode with minimal output.
 *
 * Runs the domain-exclusion and alternative-sources strategies in parallel
 * and returns the first outcome that actually succeeded.
 *
 * @param {Object} error - The 451 error
 * @param {Object} options - Search options
 * @param {string} blockedDomain - The blocked domain (may be null when unknown)
 * @returns {Object} The winning strategy's response, or `{ error: true, message, blockedDomain }`
 */
async function handleSimple451Recovery(error, options, blockedDomain) {
  console.log('⚡ 451 error - attempting recovery...');

  // The strategies resolve with { success: false } on failure instead of
  // rejecting; turn those into rejections so Promise.any waits for a real
  // success and rejects only when every strategy has failed.
  const requireSuccess = async (pending) => {
    const outcome = await pending;
    if (!outcome?.success) {
      throw new Error(typeof outcome?.error === 'string' ? outcome.error : 'Recovery strategy failed');
    }
    return outcome;
  };

  const strategies = [
    requireSuccess(searchWithExcludedDomainUnified(options, blockedDomain, true)),
    requireSuccess(tryAlternativeSearchSources(options, true))
  ];

  try {
    const results = await Promise.any(strategies);
    console.log(`⚡ 451 recovered in ${results.responseTime}ms`);
    return results;
  } catch (aggregateError) {
    console.log('❌ 451 recovery failed');
    return {
      error: true,
      message: `Failed to recover from 451 error. Domain ${blockedDomain || 'unknown'} is blocked.`,
      blockedDomain: blockedDomain
    };
  }
}
|
||||
|
||||
/**
 * Classifies 451 failure types for enhanced error handling.
 *
 * Returns 'permanent-block' when any underlying error message contains
 * "blocked until" (case-insensitive), otherwise 'recovery-failed'. The
 * "exclude the domain" suggestion is only offered when the blocked domain is
 * known, so no `-site:null` command is ever produced.
 *
 * @param {AggregateError} aggregateError - The combined error from failed strategies
 * @param {string} blockedDomain - The blocked domain (null/empty when unknown)
 * @param {Object} options - Original search options
 * @returns {Object} Failure classification: `{ type, suggestions, autoSuggestion }`
 */
function classify451Failure(aggregateError, blockedDomain, options) {
  // Tolerate a plain Error (no `.errors`) and entries without a message
  const failures = Array.isArray(aggregateError?.errors) ? aggregateError.errors : [];
  const query = options?.query ?? '';

  // Builds the ready-to-run suggestion, or nothing when the domain is unknown
  const excludeDomainSuggestion = (description) => (
    blockedDomain
      ? [{
        type: 'ready-to-run',
        command: `/search-plus "${query} -site:${blockedDomain}"`,
        description
      }]
      : []
  );

  // Check for permanent block patterns
  const hasBlockedUntil = failures.some((failure) => (
    String(failure?.message ?? '').toLowerCase().includes('blocked until')
  ));

  if (hasBlockedUntil) {
    return {
      type: 'permanent-block',
      suggestions: [
        ...excludeDomainSuggestion('Exclude blocked domain and search again'),
        {
          type: 'manual-search',
          url: `https://www.google.com/search?q=${encodeURIComponent(query)}`,
          description: 'Search manually in external browser'
        }
      ],
      autoSuggestion: {
        message: 'For more predictable results, enable simple 451 handling?',
        command: 'export SEARCH_PLUS_451_SIMPLE_MODE=true',
        benefit: 'Provides clear guidance instead of complex automation'
      }
    };
  }

  // Default classification
  return {
    type: 'recovery-failed',
    suggestions: excludeDomainSuggestion('Try again excluding the blocked domain'),
    autoSuggestion: {
      message: 'Want simpler error handling?',
      command: 'export SEARCH_PLUS_451_SIMPLE_MODE=true',
      benefit: 'Minimal output with focus on results'
    }
  };
}
|
||||
|
||||
/**
 * Assembles the final failure object for an unrecoverable 451 error from a
 * failure classification.
 * @param {Object} failureType - Classification with `type`, `suggestions` and `autoSuggestion`
 * @param {string} blockedDomain - The blocked domain ('unknown' is shown in the message when falsy)
 * @param {Object} options - Original search options (not read here)
 * @returns {Object} `{ error: true, message, blockedDomain, failureType, suggestions, autoSuggestion }`
 */
function generateEnhancedErrorResponse(failureType, blockedDomain, options) {
  const domainLabel = blockedDomain || 'unknown';
  const { type, suggestions, autoSuggestion } = failureType;

  return {
    error: true,
    message: `Failed to retrieve results after handling 451 SecurityCompromiseError. Domain ${domainLabel} is blocked.`,
    blockedDomain,
    failureType: type,
    suggestions,
    autoSuggestion
  };
}
|
||||
|
||||
/**
 * Extracts the blocked domain from an error message.
 *
 * Recognises "domain <name> blocked" and "access to <name> blocked"
 * (case-insensitive); the first form is tried first.
 *
 * @param {string} errorMessage - The error message
 * @returns {string|null} The blocked domain, or null if not found or the message is not a string
 */
function extractBlockedDomain(errorMessage) {
  // Callers pass error.message, which may be missing on code-only errors
  if (typeof errorMessage !== 'string') {
    return null;
  }

  const patterns = [
    /domain (\S+) blocked/i,
    /access to (\S+) blocked/i
  ];

  for (const pattern of patterns) {
    const match = errorMessage.match(pattern);
    if (match) {
      return match[1];
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Pulls the text that follows "blocked until" out of an error message.
 * The capture stops before a following " due" or at the end of the message,
 * and is returned trimmed.
 * @param {string} errorMessage - The error message
 * @returns {string|null} The block expiration text, or null if not found
 */
function extractBlockUntilDate(errorMessage) {
  const untilPattern = /blocked until (.+?)(?:\s+due|$)/i;
  const captured = errorMessage.match(untilPattern)?.[1];

  return captured === undefined ? null : captured.trim();
}
|
||||
|
||||
/**
 * Alternative search sources with configurable optimization level.
 *
 * Re-runs the search with "alternative OR substitute OR replacement" appended
 * to the query and at most 8 results. Optimized mode uses a 1500ms deadline
 * enforced through an AbortController signal passed to the search call;
 * standard mode races the search against RECOVERY_TIMEOUT_MS. Both modes
 * resolve with a standard success/error response and never reject.
 *
 * NOTE(review): optimized mode reads the domain to exclude from
 * `options.blockedDomain`; the optimized callers visible in this file pass
 * the raw search options, so the `-site:` filter is usually empty there.
 *
 * @param {Object} options - Original search options
 * @param {boolean} optimized - Whether to use optimized timeouts for parallel execution
 * @returns {Promise<Object>} Standard response wrapping results from alternative sources
 */
async function tryAlternativeSearchSources(options, optimized = false) {
  const startTime = Date.now();
  const strategyName = 'alternative-search-sources';
  const timeout = optimized ? 1500 : RECOVERY_TIMEOUT_MS;
  const TIMED_OUT = Symbol('timed-out');
  let timeoutId;

  try {
    console.log(optimized ? '🔍 Trying alternative search sources...' : 'Trying alternative search sources...');

    const blockedDomain = optimized
      ? (options.blockedDomain || null)
      : (options.error ? extractBlockedDomain(options.error.message || '') : null);
    const domainFilter = blockedDomain ? `-site:${blockedDomain}` : '';

    // Join only the non-empty parts so a missing filter leaves no doubled space
    const modifiedQuery = [options.query, domainFilter, 'alternative OR substitute OR replacement']
      .filter(Boolean)
      .join(' ');

    const modifiedParams = {
      ...options,
      query: modifiedQuery,
      include_answer: true,
      max_results: Math.min(options.max_results || 10, 8)
    };

    if (optimized) {
      // Abort the in-flight search once the deadline passes
      const abortController = new AbortController();
      timeoutId = setTimeout(() => {
        abortController.abort();
      }, timeout);

      try {
        const results = await contentExtractor.tavily.search({
          ...modifiedParams,
          signal: abortController.signal
        });
        return createStandardSuccessResponse(strategyName, results, startTime);
      } catch (searchError) {
        if (searchError?.name === 'AbortError') {
          throw new Error('Strategy timeout');
        }
        throw searchError;
      }
    }

    // Standard mode: race the search against the deadline
    const timeoutPromise = new Promise((resolve) => {
      timeoutId = setTimeout(() => resolve(TIMED_OUT), timeout);
    });
    const outcome = await Promise.race([
      contentExtractor.tavily.search(modifiedParams),
      timeoutPromise
    ]);

    if (outcome === TIMED_OUT) {
      return createStandardErrorResponse(strategyName, `Strategy timed out after ${timeout}ms`, startTime);
    }

    // Wrap raw results so both modes return the same response shape
    return createStandardSuccessResponse(strategyName, outcome, startTime);
  } catch (error) {
    return createStandardErrorResponse(strategyName, error, startTime);
  } finally {
    // Never leave the deadline timer pending, whichever path was taken
    clearTimeout(timeoutId);
  }
}
|
||||
|
||||
|
||||
/**
 * Domain exclusion search with configurable optimization level.
 *
 * Re-runs the search with `-site:<blockedDomain>` appended and a fresh header
 * set. Optimized mode searches immediately with a 1000ms deadline enforced
 * through an AbortController signal; standard mode waits 3 seconds, then
 * searches, racing the whole sequence against RECOVERY_TIMEOUT_MS. Both modes
 * resolve with a standard success/error response and never reject.
 *
 * NOTE(review): in standard mode the fixed 3s delay counts against the
 * deadline, so a RECOVERY_TIMEOUT_MS below roughly 3000ms always times out.
 *
 * @param {Object} options - Original search options
 * @param {string} blockedDomain - The blocked domain
 * @param {boolean} optimized - Whether to use optimized timeouts for parallel execution
 * @returns {Promise<Object>} Standard response wrapping the search results
 */
async function searchWithExcludedDomainUnified(options, blockedDomain, optimized = false) {
  const startTime = Date.now();
  const strategyName = 'excluded-domain-search';
  const timeout = optimized ? 1000 : RECOVERY_TIMEOUT_MS;
  const TIMED_OUT = Symbol('timed-out');
  let timeoutId;
  let delayId;

  try {
    if (!blockedDomain) {
      return createStandardErrorResponse(strategyName, 'No blocked domain to exclude', startTime);
    }

    console.log(optimized ? `🛡️ Excluding domain: ${blockedDomain}` : `Searching while excluding domain: ${blockedDomain}`);

    const exclusionQuery = `${options.query} -site:${blockedDomain}`;
    const modifiedParams = {
      ...options,
      query: exclusionQuery,
      headers: generateDiverseHeaders()
    };

    if (optimized) {
      // Abort the in-flight search once the deadline passes
      const abortController = new AbortController();
      timeoutId = setTimeout(() => {
        abortController.abort();
      }, timeout);

      try {
        const results = await contentExtractor.tavily.search({
          ...modifiedParams,
          signal: abortController.signal
        });
        return createStandardSuccessResponse(strategyName, results, startTime);
      } catch (searchError) {
        if (searchError?.name === 'AbortError') {
          throw new Error('Strategy timeout');
        }
        throw searchError;
      }
    }

    // Standard mode: delay, then search, all raced against the deadline
    const delayedSearch = (async () => {
      await new Promise((resolve) => {
        delayId = setTimeout(resolve, 3000);
      });
      return contentExtractor.tavily.search(modifiedParams);
    })();

    const timeoutPromise = new Promise((resolve) => {
      timeoutId = setTimeout(() => resolve(TIMED_OUT), timeout);
    });

    const outcome = await Promise.race([delayedSearch, timeoutPromise]);

    if (outcome === TIMED_OUT) {
      return createStandardErrorResponse(strategyName, `Strategy timed out after ${timeout}ms`, startTime);
    }

    return createStandardSuccessResponse(strategyName, outcome, startTime);
  } catch (error) {
    return createStandardErrorResponse(strategyName, error, startTime);
  } finally {
    // Release both timers: the deadline must not outlive the strategy, and a
    // still-pending delay must not start a search nobody is waiting for
    clearTimeout(timeoutId);
    clearTimeout(delayId);
  }
}
|
||||
|
||||
|
||||
/**
 * Reformulates query to avoid references to blocked domains.
 *
 * Every occurrence of the blocked domain in the query (case-insensitive) is
 * replaced with a generic description, then the search is retried with
 * `search_depth: "basic"` after a 2.5s pause. The whole attempt is raced
 * against RECOVERY_TIMEOUT_MS and always resolves with
 * `{ success, data|error, strategy, responseTime }`.
 *
 * @param {Object} options - Original search options
 * @param {string} blockedDomain - The blocked domain
 * @returns {Object} Search results
 */
async function reformulateQueryAvoidingBlockedDomain(options, blockedDomain) {
  const startTime = Date.now();
  const strategyName = 'reformulate-query';
  let delayId;
  let timeoutId;

  const strategyPromise = (async () => {
    try {
      console.log('Reformulating query to avoid blocked domain references...');

      let reformulatedQuery = options.query;
      if (blockedDomain) {
        const domainMappings = {
          'httpbin.org': 'HTTP testing API endpoint service',
          'github.com': 'code repository platform',
          'stackoverflow.com': 'programming Q&A website',
          'medium.com': 'blogging platform'
        };

        // Own-property check so names like "constructor" do not hit the prototype
        const genericTerm = Object.prototype.hasOwnProperty.call(domainMappings, blockedDomain)
          ? domainMappings[blockedDomain]
          : 'online service';

        // The domain comes from an error message: escape it so "." and other
        // metacharacters are matched literally and cannot break the pattern
        const escapedDomain = String(blockedDomain).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

        // Function replacer keeps "$" sequences in the replacement text literal
        reformulatedQuery = options.query.replace(new RegExp(escapedDomain, 'gi'), () => genericTerm);
      }

      const modifiedParams = { ...options, query: reformulatedQuery, search_depth: "basic" };

      await new Promise((resolve) => {
        delayId = setTimeout(resolve, 2500);
      });
      const results = await contentExtractor.tavily.search(modifiedParams);

      return { success: true, data: results, strategy: strategyName, responseTime: Date.now() - startTime };
    } catch (error) {
      return { success: false, error: error?.message, strategy: strategyName, responseTime: Date.now() - startTime };
    }
  })();

  const timeoutPromise = new Promise((resolve) => {
    timeoutId = setTimeout(() => resolve({
      success: false,
      error: `Strategy timed out after ${RECOVERY_TIMEOUT_MS}ms`,
      strategy: strategyName,
      responseTime: Date.now() - startTime
    }), RECOVERY_TIMEOUT_MS);
  });

  try {
    return await Promise.race([strategyPromise, timeoutPromise]);
  } finally {
    // Do not leave timers pending once a result has been produced
    clearTimeout(timeoutId);
    clearTimeout(delayId);
  }
}
|
||||
|
||||
/**
 * Attempts to use cached or archived results for blocked content.
 *
 * Extends the query with archive-related terms, limits results to at most 5,
 * pauses 4 seconds and searches. The whole attempt is raced against
 * RECOVERY_TIMEOUT_MS and always resolves with
 * `{ success, data|error, strategy, responseTime }`.
 *
 * NOTE(review): the fixed 4s pause counts against the deadline, leaving about
 * 1s for the search itself at the default 5000ms timeout.
 *
 * @param {Object} options - Original search options
 * @param {string} blockedDomain - The blocked domain
 * @returns {Object} Search results
 */
async function useCachedOrArchiveResults(options, blockedDomain) {
  const startTime = Date.now();
  const strategyName = 'archive-search';
  let delayId;
  let timeoutId;

  const strategyPromise = (async () => {
    try {
      console.log('Searching for archived or cached content...');

      const archiveQuery = blockedDomain
        ? `${options.query} web archive OR wayback machine OR cached version "site:${blockedDomain}"`
        : `${options.query} archived OR cached OR mirror`;
      const modifiedParams = { ...options, query: archiveQuery, max_results: Math.min(options.max_results || 10, 5) };

      await new Promise((resolve) => {
        delayId = setTimeout(resolve, 4000);
      });
      const results = await contentExtractor.tavily.search(modifiedParams);

      return { success: true, data: results, strategy: strategyName, responseTime: Date.now() - startTime };
    } catch (error) {
      return { success: false, error: error?.message, strategy: strategyName, responseTime: Date.now() - startTime };
    }
  })();

  const timeoutPromise = new Promise((resolve) => {
    timeoutId = setTimeout(() => resolve({
      success: false,
      error: `Strategy timed out after ${RECOVERY_TIMEOUT_MS}ms`,
      strategy: strategyName,
      responseTime: Date.now() - startTime
    }), RECOVERY_TIMEOUT_MS);
  });

  try {
    return await Promise.race([strategyPromise, timeoutPromise]);
  } finally {
    // Release both timers so nothing keeps running after the result is known
    clearTimeout(timeoutId);
    clearTimeout(delayId);
  }
}
|
||||
|
||||
/**
 * Recovers from a refused connection by pausing 5 seconds and retrying once
 * with a fresh header set and a timeout 5 seconds longer than the configured
 * one (10000ms is assumed when none is set).
 * @param {Object} error - The connection error
 * @param {Object} options - Search options
 * @returns {Object} `{ success: true, data, message }` or `{ error: true, message }`
 */
async function handleConnectionRefusedError(error, options) {
  console.log('Handling connection refused error...');

  const pause = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

  try {
    // A short wait is sometimes all it takes for the endpoint to come back
    await pause(5000);

    const data = await contentExtractor.tavily.search({
      ...options,
      headers: generateDiverseHeaders(),
      // Allow the retry more time than the original attempt had
      timeout: (options.timeout || 10000) + 5000
    });

    return {
      success: true,
      data,
      message: 'Successfully retrieved results after handling connection refused error'
    };
  } catch (retryError) {
    const message = `Failed to retrieve results after handling connection refused error: ${retryError.message}`;
    return { error: true, message };
  }
}
|
||||
|
||||
/**
 * Recovers from a timed-out search by retrying once with a fresh header set
 * and twice the configured timeout, capped at 30 seconds (10000ms is assumed
 * when no timeout is set).
 * @param {Object} error - The timeout error
 * @param {Object} options - Search options
 * @returns {Object} `{ success: true, data, message }` or `{ error: true, message }`
 */
async function handleTimeoutError(error, options) {
  console.log('Handling timeout error...');

  try {
    const data = await contentExtractor.tavily.search({
      ...options,
      headers: generateDiverseHeaders(),
      // Double the timeout, but never beyond 30s
      timeout: Math.min((options.timeout || 10000) * 2, 30000)
    });

    return {
      success: true,
      data,
      message: 'Successfully retrieved results after handling timeout error'
    };
  } catch (retryError) {
    const message = `Failed to retrieve results after handling timeout error: ${retryError.message}`;
    return { error: true, message };
  }
}
|
||||
|
||||
/**
 * Builds a browser-like header set with a randomly chosen User-Agent and
 * Accept-Language; all other headers are fixed.
 * @returns {Object} Header name/value map
 */
function generateDiverseHeaders() {
  // Uniform random pick from a non-empty list
  const pickRandom = (choices) => choices[Math.floor(Math.random() * choices.length)];

  // Drawn first, then the language, so the random sequence is consumed in a fixed order
  const userAgent = pickRandom([
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  ]);

  const acceptLanguage = pickRandom([
    'en-US,en;q=0.9',
    'en-GB,en;q=0.9',
    'en-CA,en;q=0.9',
    'en-AU,en;q=0.9'
  ]);

  return {
    'User-Agent': userAgent,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': acceptLanguage,
    'Accept-Encoding': 'gzip, deflate, br',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Cache-Control': 'max-age=0'
  };
}
|
||||
|
||||
/**
 * Detects if error is a 422 schema validation error.
 *
 * Looks for known schema-validation phrases, case-insensitively, in the
 * error's message and in its JSON serialisation. Errors that cannot be
 * serialised (for example, ones with circular references) are judged on
 * their message alone.
 *
 * NOTE(review): the 'missing' pattern is very broad and will match many
 * unrelated messages - confirm that is intended.
 *
 * @param {Object} error - The error object
 * @returns {boolean} True if this is a 422 schema error
 */
function is422SchemaError(error) {
  const errorMessage = typeof error?.message === 'string' ? error.message : '';

  // JSON.stringify throws on circular structures and may return undefined
  let errorString = '';
  try {
    errorString = JSON.stringify(error) ?? '';
  } catch {
    errorString = '';
  }

  // All patterns are lower case because the haystack is lower-cased below
  const schemaErrorPatterns = [
    'missing',
    'input_schema',
    'field required',
    'unprocessable entity',
    'validation error',
    'schema validation',
    'invalid request format'
  ];

  const haystack = `${errorMessage}\n${errorString}`.toLowerCase();

  return schemaErrorPatterns.some(pattern => haystack.includes(pattern));
}
|
||||
|
||||
/**
 * Recovers from a 422 Unprocessable Entity (schema validation) error by
 * trying four strategies one after another - schema repair, query
 * simplification, query reformulation, alternative API format - and
 * returning the first result that is truthy and carries no `error` field.
 * @param {Object} error - The 422 error
 * @param {Object} options - Search options
 * @returns {Object} `{ success: true, data, message }` or `{ error: true, message }`
 */
async function handle422Error(error, options) {
  console.log('Handling 422 schema validation error...');

  // Wrapped in thunks so each strategy is only looked up and started on its turn
  const attempts = [
    () => repairSchemaAndRetry(options),
    () => simplifyQueryAndRetry(options),
    () => reformulateQueryForSchema(options),
    () => tryAlternativeAPIFormat(options)
  ];

  for (let index = 0; index < attempts.length; index += 1) {
    let outcome;

    try {
      console.log('Attempting 422 error recovery strategy...');
      outcome = await attempts[index]();
    } catch (strategyError) {
      console.log('422 recovery strategy failed:', strategyError.message);
      continue;
    }

    const usable = Boolean(outcome) && !outcome.error;
    if (usable) {
      return {
        success: true,
        data: outcome,
        message: 'Successfully retrieved results after handling 422 schema error'
      };
    }
  }

  // Every strategy either threw or produced an unusable result
  return {
    error: true,
    message: `Failed to retrieve results after handling 422 schema error: ${error.message}`
  };
}
|
||||
|
||||
/**
 * Retries the search with an `input_schema` entry added to the options.
 *
 * @param {Object} options - Original search options
 * @returns {Object} Search results from `contentExtractor.tavily.search`
 */
async function repairSchemaAndRetry(options) {
  console.log('Attempting schema repair...');

  // Any `input_schema` already present in the options is overridden.
  const schemaPatch = {
    input_schema: {
      type: "web_search_20250305",
      name: "web_search",
      max_uses: 8
    }
  };
  const request = Object.assign({}, options, schemaPatch);

  // Pause for one second before retrying
  await new Promise((wake) => {
    setTimeout(wake, 1000);
  });

  return contentExtractor.tavily.search(request);
}
|
||||
|
||||
/**
 * Retries the search with a simplified query, a basic search depth and a
 * result limit capped at 5.
 *
 * The limit is read from either `max_results` or `maxResults`; the smaller of
 * the two (at most 5) is used. `max_results` is always set on the retry
 * request. `maxResults` is additionally overwritten when the caller supplied
 * it, so that a larger camelCase value does not survive next to the cap.
 *
 * @param {Object} options - Original search options
 * @returns {Object} Search results from `contentExtractor.tavily.search`
 */
async function simplifyQueryAndRetry(options) {
  console.log('Simplifying query for schema compatibility...');

  const simplifiedQuery = simplifyQueryForSchema(options.query);

  // Missing or zero limits fall back to 10 before the cap is applied.
  const cappedResults = Math.min(
    options.max_results || 10,
    options.maxResults || 10,
    5
  );

  const simplifiedParams = {
    ...options,
    query: simplifiedQuery,
    max_results: cappedResults, // Reduce complexity
    search_depth: "basic" // Use simpler search mode
  };

  // Only touch the camelCase key when the caller used it, to avoid adding a
  // parameter the request did not previously carry.
  if (options.maxResults !== undefined) {
    simplifiedParams.maxResults = cappedResults;
  }

  await new Promise(resolve => setTimeout(resolve, 1500));

  return await contentExtractor.tavily.search(simplifiedParams);
}
|
||||
|
||||
/**
 * Retries the search with a reformulated query and with answer and raw
 * content generation switched off.
 *
 * `include_answer` and `include_raw_content` are always set to `false`. When
 * the caller supplied the camelCase equivalents (`includeAnswer`,
 * `includeRawContent`), those are set to `false` as well so the two spellings
 * cannot contradict each other.
 *
 * @param {Object} options - Original search options
 * @returns {Object} Search results from `contentExtractor.tavily.search`
 */
async function reformulateQueryForSchema(options) {
  console.log('Reformulating query for schema compatibility...');

  const reformulatedQuery = reformulateQueryForSchemaCompatibility(options.query);

  const reformulatedParams = {
    ...options,
    query: reformulatedQuery,
    include_answer: false, // Simplify request
    include_raw_content: false
  };

  // Mirror the switch-off onto camelCase keys only when they were provided.
  if ('includeAnswer' in options) {
    reformulatedParams.includeAnswer = false;
  }
  if ('includeRawContent' in options) {
    reformulatedParams.includeRawContent = false;
  }

  await new Promise(resolve => setTimeout(resolve, 2000));

  return await contentExtractor.tavily.search(reformulatedParams);
}
|
||||
|
||||
/**
 * Retries the search with a minimal request: only the query, the API key and
 * a basic search depth are sent; every other option is dropped.
 *
 * @param {Object} options - Original search options
 * @returns {Object} Search results from `contentExtractor.tavily.search`
 */
async function tryAlternativeAPIFormat(options) {
  console.log('Trying alternative API format...');

  const { query, api_key } = options;
  const minimalParams = { query, api_key, search_depth: "basic" };

  // Pause for three seconds before retrying
  await new Promise((wake) => {
    setTimeout(wake, 3000);
  });

  return contentExtractor.tavily.search(minimalParams);
}
|
||||
|
||||
/**
 * Simplifies a query for schema compatibility.
 *
 * Keeps letters, combining marks and digits of any script, underscores,
 * whitespace and the punctuation `- . , ! ?`; removes everything else.
 * Whitespace runs are then collapsed to single spaces and the result is
 * limited to 200 UTF-16 code units.
 *
 * @param {string} query - Original query (null/undefined is treated as empty)
 * @returns {string} Simplified query
 */
function simplifyQueryForSchema(query) {
  return String(query ?? '')
    // Unicode-aware: an ASCII-only \w would delete accented and non-Latin text.
    .replace(/[^\p{L}\p{M}\p{N}_\s\-.,!?]/gu, '')
    // Collapse AFTER stripping, since removed characters can leave gaps.
    .replace(/\s+/g, ' ')
    .trim()
    .substring(0, 200) // Limit length
    .trim(); // The cut may land right after a space
}
|
||||
|
||||
/**
 * Reformulates a query for schema compatibility.
 *
 * If the query has more than 8 words longer than two characters, it is
 * reduced to the first 6 of those words. Otherwise standalone 4-digit numbers
 * are removed, specific platform names and a few terms are replaced with
 * generic wording, and whitespace is collapsed.
 *
 * @param {string} query - Original query (null/undefined is treated as empty)
 * @returns {string} Reformulated query
 */
function reformulateQueryForSchemaCompatibility(query) {
  const text = String(query ?? '');

  // Break down complex queries into simpler components
  const words = text.split(/\s+/).filter((word) => word.length > 2);
  if (words.length > 8) {
    // If the query is too long, keep only the leading terms
    return words.slice(0, 6).join(' ');
  }

  // Replace problematic patterns
  return text
    // Word boundaries keep longer numbers (e.g. "123456") intact.
    .replace(/\b\d{4}\b/g, '') // Remove years
    .replace(/github|gitlab|bitbucket/gi, 'code repository') // Replace specific platforms
    .replace(/open source|open-source/gi, 'free software') // Simplify terminology
    .replace(/platform|boilerplate|framework/gi, 'software') // Generic terms
    .replace(/\s+/g, ' ') // Close gaps left by removed tokens
    .trim();
}
|
||||
|
||||
/**
 * Rewrites common question openers in a query with alternative phrasing.
 *
 * Matching is case-insensitive and every occurrence is replaced.
 *
 * @param {string} query - Original query
 * @returns {string} Reformulated query
 */
function reformulateQuery(query) {
  // Simple phrase substitution - could be enhanced with more sophisticated NLP
  const rewrites = [
    [/how to/gi, 'guide for'],
    [/what is/gi, 'information about'],
    [/why is/gi, 'reason for'],
    [/when did/gi, 'date of']
  ];

  return rewrites.reduce(
    (text, [phrase, substitute]) => text.replace(phrase, substitute),
    query
  );
}
|
||||
|
||||
// Export additional functions for testing
|
||||
export {
|
||||
classify451Failure,
|
||||
validateRecoveryTimeout,
|
||||
createStandardErrorResponse,
|
||||
createStandardSuccessResponse
|
||||
};
|
||||
458
hooks/handle-web-search.mjs
Normal file
458
hooks/handle-web-search.mjs
Normal file
@@ -0,0 +1,458 @@
|
||||
// hooks/handle-web-search.mjs
|
||||
import { tavily, extractContent } from './content-extractor.mjs';
|
||||
import { handleWebSearchError } from './handle-search-error.mjs';
|
||||
|
||||
// API keys: the SEARCH_PLUS_-prefixed variable wins, the legacy un-prefixed
// name is the fallback, and a key that is unset or empty resolves to null.
const TAVILY_API_KEY =
  [process.env.SEARCH_PLUS_TAVILY_API_KEY, process.env.TAVILY_API_KEY].find(Boolean) ?? null;
const JINAAI_API_KEY =
  [process.env.SEARCH_PLUS_JINAAI_API_KEY, process.env.JINAAI_API_KEY].find(Boolean) ?? null;

// Warn when a key is supplied only through its deprecated (legacy) name
for (const legacyName of ['TAVILY_API_KEY', 'JINAAI_API_KEY']) {
  const preferredName = `SEARCH_PLUS_${legacyName}`;
  if (!process.env[preferredName] && process.env[legacyName]) {
    console.warn(`⚠️ ${legacyName} is deprecated. Please update to ${preferredName}`);
  }
}
|
||||
|
||||
/**
 * Tells whether the input parses as an absolute http(s) URL.
 *
 * @param {string} input - The input to check
 * @returns {boolean} True only for parseable URLs using http: or https:
 */
function isURL(input) {
  let parsed;

  try {
    parsed = new URL(input);
  } catch {
    // Not parseable as an absolute URL
    return false;
  }

  return ['http:', 'https:'].includes(parsed.protocol);
}
|
||||
|
||||
/**
 * Handles web search requests with enhanced error handling.
 *
 * A query that is an http(s) URL is routed to content extraction. Any other
 * query goes through the hybrid search; if that throws, the error is handed
 * to `handleWebSearchError` for one recovery attempt.
 *
 * @param {Object} [params] - Search parameters
 * @param {string} [params.query] - Search query or URL (`params.q` is an alias)
 * @param {number} [params.maxRetries=3] - Retry budget for URL extraction
 * @param {number} [params.timeout=10000] - Timeout in milliseconds
 * @param {number} [params.maxResults=5] - Maximum number of results
 * @param {boolean} [params.includeAnswer=true] - Only an explicit `false` disables it
 * @param {boolean} [params.includeRawContent=false] - Request raw content
 * @returns {Object} Search results or error information
 */
export async function handleWebSearch(params = {}) {
  const query = params.query || params.q || '';
  const maxRetries = params.maxRetries || 3;
  const timeout = params.timeout || 10000; // 10 seconds default

  if (!query) {
    return {
      error: true,
      message: 'No search query or URL provided'
    };
  }

  // Check if the query is a URL and handle extraction
  if (isURL(query)) {
    console.log(`🔍 Extracting content from URL: ${query}`);
    const result = await handleURLExtraction(query, { maxRetries, timeout });

    // Provide brief status feedback
    if (result.success) {
      console.log(`✅ URL extraction completed successfully`);
    } else {
      console.log(`❌ URL extraction failed: ${result.message}`);
    }

    return result;
  }

  // Past this point the query is known not to be a URL.
  console.log(`🔍 Searching: ${query}`);

  // Built once so the first attempt and the recovery attempt always agree.
  // `includeAnswer` defaults to true; `|| true` would ignore an explicit false.
  const baseOptions = {
    query,
    maxResults: params.maxResults || 5,
    includeAnswer: params.includeAnswer !== false,
    includeRawContent: params.includeRawContent || false
  };

  // Use hybrid search strategy
  try {
    const searchParams = { ...baseOptions, headers: generateRandomHeaders() };
    const result = await performHybridSearch(searchParams, timeout);

    return {
      success: true,
      data: result.data,
      service: result.service,
      attempt: 1
    };
  } catch (error) {
    console.error('All search strategies failed:', error.message);

    // Final error handling for recovery attempts
    const errorResult = await handleWebSearchError(error, {
      ...baseOptions,
      headers: generateRandomHeaders(),
      timeout,
      attempt: 1,
      error: error
    });

    if (errorResult && errorResult.success) {
      return {
        success: true,
        data: errorResult.data,
        attempt: 1,
        errorRecovered: true,
        originalError: error.message,
        recoveryMessage: errorResult.message
      };
    }

    return {
      error: true,
      message: errorResult?.message || error.message,
      attempt: 1,
      errorHandlingApplied: true
    };
  }
}
|
||||
|
||||
/**
 * Hybrid web search with intelligent service selection.
 * Sequential: Tavily → parallel free services.
 * Note: Jina API is only used for URL extraction, not web search.
 *
 * @param {Object} params - Search parameters passed to every service
 * @param {number} [timeoutMs=10000] - Per-service timeout in milliseconds
 * @returns {Object} `{ data, service }` from the first service that succeeds
 * @throws {Error} When every service fails; `cause` holds the AggregateError
 *   with the individual free-service failures
 */
async function performHybridSearch(params, timeoutMs = 10000) {
  // Phase 1: Try Tavily API (premium service), only when a key is configured
  if (TAVILY_API_KEY) {
    try {
      console.log('🚀 Trying Tavily API...');
      const result = await tavily.search(params, timeoutMs);
      return { data: result, service: 'tavily' };
    } catch (error) {
      // Log the reason instead of dropping it, so the failure is diagnosable.
      console.log('🔄 Tavily failed, trying free services...', error?.message ?? error);
    }
  }

  // Phase 2: Parallel execution for free services
  console.log('🌐 Trying all free search engines in parallel...');
  const freeStrategies = [
    trySearXNGSearch(params, timeoutMs),
    tryDuckDuckGoHTML(params, timeoutMs),
    tryStartpageHTML(params, timeoutMs)
  ];

  try {
    // Promise.any resolves with the first fulfilled strategy
    const result = await Promise.any(freeStrategies);
    console.log(`✅ Success with free service: ${result.service}`);
    return result;
  } catch (aggregateError) {
    // Message unchanged; the per-service errors travel along as `cause`.
    throw new Error(
      'All search services failed. Try again or configure Tavily API key for enhanced reliability.',
      { cause: aggregateError }
    );
  }
}
|
||||
|
||||
|
||||
/**
 * Attempts search using the SearXNG metasearch engine.
 *
 * Instances are tried one after another; the first that answers with a
 * non-empty `results` array wins. Results are reshaped into a Tavily-like
 * structure.
 *
 * @param {Object} params - Search parameters (`query`, `maxResults`, `headers`)
 * @param {number} [timeoutMs=10000] - Per-request timeout in milliseconds
 * @returns {Object} `{ data, service: 'searxng' }`; `data.response_time` is
 *   the elapsed time of the successful request in milliseconds
 * @throws {Error} When no instance returns usable results
 */
async function trySearXNGSearch(params, timeoutMs = 10000) {
  const searxngInstances = [
    'https://search.brave.works',
    'https://searx.be',
    'https://searx.tiekoetter.com',
    'https://search.snopyta.org'
  ];

  const query = encodeURIComponent(params.query);
  const maxResults = params.maxResults || 5;

  for (const instance of searxngInstances) {
    // Measured per instance so response_time reflects the winning request.
    const startedAt = Date.now();

    try {
      const searchUrl = `${instance}/search?q=${query}&format=json&engines=google,duckduckgo,startpage&results=${maxResults}`;

      const response = await fetch(searchUrl, {
        method: 'GET',
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept': 'application/json, text/plain, */*',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept-Encoding': 'gzip, deflate',
          'Connection': 'keep-alive',
          'Referer': instance,
          'Sec-Fetch-Dest': 'empty',
          'Sec-Fetch-Mode': 'cors',
          'Sec-Fetch-Site': 'same-origin',
          // Caller-supplied headers override the defaults above
          ...params.headers
        },
        signal: AbortSignal.timeout(timeoutMs)
      });

      if (!response.ok) {
        continue; // Try next instance
      }

      const data = await response.json();

      if (!data.results || data.results.length === 0) {
        continue; // Try next instance
      }

      // Transform SearXNG results to Tavily-like format
      const transformedResults = {
        results: data.results.slice(0, maxResults).map((item, index) => ({
          title: item.title,
          url: item.url,
          content: item.content || '',
          score: 1.0 - (index * 0.1), // Simple rank-based scoring
          published_date: item.publishedDate || null
        })),
        answer: data.answers?.[0] || null,
        query: params.query,
        response_time: Date.now() - startedAt
      };

      return { data: transformedResults, service: 'searxng' };
    } catch (error) {
      console.log(`❌ SearXNG instance ${instance} failed: ${error.message}`);
      continue; // Try next instance
    }
  }

  throw new Error('All SearXNG instances failed');
}
|
||||
|
||||
/**
 * Attempts search by fetching and parsing DuckDuckGo's HTML results page.
 *
 * @param {Object} params - Search parameters (`query`, `maxResults`, `headers`)
 * @param {number} [timeoutMs=10000] - Request timeout in milliseconds
 * @returns {Object} `{ data, service: 'duckduckgo-html' }`; `data.response_time`
 *   is the elapsed time of the call in milliseconds
 * @throws {Error} On a non-OK HTTP status or when no result can be parsed
 */
async function tryDuckDuckGoHTML(params, timeoutMs = 10000) {
  const startedAt = Date.now();
  const query = encodeURIComponent(params.query);
  const maxResults = params.maxResults || 5;

  const searchUrl = `https://html.duckduckgo.com/html/?q=${query}&kl=us-en`;

  const response = await fetch(searchUrl, {
    method: 'GET',
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Sec-Fetch-Dest': 'document',
      'Sec-Fetch-Mode': 'navigate',
      'Sec-Fetch-Site': 'none',
      'Cache-Control': 'max-age=0',
      // Caller-supplied headers override the defaults above
      ...params.headers
    },
    signal: AbortSignal.timeout(timeoutMs)
  });

  if (!response.ok) {
    throw new Error(`DuckDuckGo HTML error: ${response.status}`);
  }

  const html = await response.text();

  // Parse HTML results: capture groups are link href, link text, snippet text
  const results = [];
  const resultRegex = /<div class="result">[\s\S]*?<a rel="nofollow" class="result__a" href="([^"]+)">([^<]+)<\/a>[\s\S]*?<a class="result__snippet" href="[^"]*">([^<]*)<\/a>/g;

  let match;
  while ((match = resultRegex.exec(html)) !== null && results.length < maxResults) {
    const [, url, title, snippet] = match;

    if (url && title && !url.includes('//r.jina.ai/http')) { // Filter out redirect links
      results.push({
        title: title.trim(),
        // Protocol-relative hrefs ("//host/...") are promoted to https
        url: url.startsWith('http') ? url : `https:${url}`,
        content: snippet ? snippet.replace(/<[^>]*>/g, '').trim() : '',
        score: 1.0 - (results.length * 0.1)
      });
    }
  }

  if (results.length === 0) {
    throw new Error('No results found in DuckDuckGo HTML response');
  }

  const transformedResults = {
    results,
    answer: null, // DuckDuckGo doesn't provide instant answers in HTML mode
    query: params.query,
    response_time: Date.now() - startedAt
  };

  return { data: transformedResults, service: 'duckduckgo-html' };
}
|
||||
|
||||
/**
 * Attempts search by fetching and parsing Startpage's HTML results page.
 *
 * @param {Object} params - Search parameters (`query`, `maxResults`, `headers`)
 * @param {number} [timeoutMs=10000] - Request timeout in milliseconds
 * @returns {Object} `{ data, service: 'startpage-html' }`; `data.response_time`
 *   is the elapsed time of the call in milliseconds
 * @throws {Error} On a non-OK HTTP status or when no result can be parsed
 */
async function tryStartpageHTML(params, timeoutMs = 10000) {
  const startedAt = Date.now();
  const query = encodeURIComponent(params.query);
  const maxResults = params.maxResults || 5;

  const searchUrl = `https://www.startpage.com/do/search?query=${query}&cat=web&pl=ext-ff&extVersion=1.3.0`;

  const response = await fetch(searchUrl, {
    method: 'GET',
    headers: {
      'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Sec-Fetch-Dest': 'document',
      'Sec-Fetch-Mode': 'navigate',
      'Sec-Fetch-Site': 'none',
      'Cache-Control': 'max-age=0',
      // Caller-supplied headers override the defaults above
      ...params.headers
    },
    signal: AbortSignal.timeout(timeoutMs)
  });

  if (!response.ok) {
    throw new Error(`Startpage HTML error: ${response.status}`);
  }

  const html = await response.text();

  // Parse HTML results (Startpage format): href, link text, snippet text
  const results = [];
  const resultRegex = /<h3><a href="([^"]+)"[^>]*>([^<]+)<\/a><\/h3>[\s\S]*?<p class="snippet">([^<]*)<\/p>/g;

  let match;
  while ((match = resultRegex.exec(html)) !== null && results.length < maxResults) {
    const [, url, title, snippet] = match;

    if (url && title) {
      results.push({
        title: title.trim(),
        // Protocol-relative hrefs ("//host/...") are promoted to https
        url: url.startsWith('http') ? url : `https:${url}`,
        content: snippet ? snippet.replace(/<[^>]*>/g, '').trim() : '',
        score: 1.0 - (results.length * 0.1)
      });
    }
  }

  if (results.length === 0) {
    throw new Error('No results found in Startpage HTML response');
  }

  const transformedResults = {
    results,
    answer: null,
    query: params.query,
    response_time: Date.now() - startedAt
  };

  return { data: transformedResults, service: 'startpage-html' };
}
|
||||
|
||||
/**
 * Builds a browser-like request header set with a User-Agent picked at
 * random from a small fixed pool; every other header is constant.
 *
 * @returns {Object} Header name → value map
 */
function generateRandomHeaders() {
  const agentPool = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0'
  ];

  // Uniform pick over the pool
  const pickedIndex = Math.floor(Math.random() * agentPool.length);
  const headers = { 'User-Agent': agentPool[pickedIndex] };

  headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8';
  headers['Accept-Language'] = 'en-US,en;q=0.5';
  headers['Accept-Encoding'] = 'gzip, deflate';
  headers['Connection'] = 'keep-alive';
  headers['Upgrade-Insecure-Requests'] = '1';

  return headers;
}
|
||||
|
||||
/**
 * Determines if an error is retryable.
 *
 * An error is retryable when its `code` is 403, 422, 429, 451, ECONNREFUSED
 * or ETIMEDOUT, when its message contains one of those markers (or
 * "SecurityCompromiseError" / "blocked until"), or when its JSON
 * serialisation contains a schema-validation phrase.
 *
 * @param {Error} error - The error to check
 * @returns {boolean} True if the error is retryable
 */
function isRetryableError(error) {
  // Coerce so a non-string message cannot throw on .includes()
  const errorMessage = String(error?.message ?? '');

  // Serialisation can throw (circular references, BigInt values); treat an
  // unserialisable error as having no serialised text.
  let errorString = '';
  try {
    errorString = (JSON.stringify(error) ?? '').toLowerCase();
  } catch {
    errorString = '';
  }

  const retryableCodes = [403, 422, 429, 451, 'ECONNREFUSED', 'ETIMEDOUT'];
  const retryableMessageParts = [
    '403',
    '422',
    '429',
    '451',
    'SecurityCompromiseError',
    'blocked until',
    'ECONNREFUSED',
    'ETIMEDOUT'
  ];
  // Schema validation patterns (lower-case; matched against lower-cased text)
  const schemaParts = ['missing', 'input_schema', 'field required'];

  return retryableCodes.includes(error?.code) ||
    retryableMessageParts.some((part) => errorMessage.includes(part)) ||
    schemaParts.some((part) => errorString.includes(part));
}
|
||||
|
||||
/**
 * Handles URL extraction with retry logic.
 *
 * Makes one initial attempt plus up to `maxRetries` retries, waiting with
 * exponential backoff between attempts. Only errors accepted by
 * `isRetryableError` are retried.
 *
 * @param {string} url - The URL to extract content from
 * @param {Object} [options] - Extraction options; all of them are forwarded
 *   to `extractContent`
 * @param {number} [options.maxRetries=3] - Retries after the first attempt;
 *   negative values count as 0, fractions are floored, non-finite values
 *   fall back to 3
 * @returns {Object} Extraction results or error information
 */
async function handleURLExtraction(url, options = {}) {
  const { maxRetries = 3 } = options;

  // Normalise so the loop always runs at least once and always terminates on
  // an attempt equal to the limit; otherwise the function could fall through
  // and return undefined.
  const retryLimit = Number.isFinite(maxRetries)
    ? Math.max(0, Math.floor(maxRetries))
    : 3;

  for (let attempt = 0; attempt <= retryLimit; attempt++) {
    try {
      // Exponential backoff before each retry: 2s, 4s, then capped at 8s
      if (attempt > 0) {
        const delay = Math.min(1000 * Math.pow(2, attempt), 8000);
        await new Promise(resolve => setTimeout(resolve, delay));
      }

      // Try to extract content with custom headers; caller options win
      const extractOptions = {
        headers: generateRandomHeaders(),
        includeImages: false, // Don't include images by default for faster processing
        ...options
      };

      const results = await extractContent(url, extractOptions);

      return {
        success: true,
        data: results,
        attempt: attempt + 1,
        isURLExtraction: true
      };
    } catch (error) {
      console.error(`URL extraction attempt ${attempt + 1} failed:`, error.message);

      // Give up when the budget is spent or the error is not retryable
      if (attempt === retryLimit || !isRetryableError(error)) {
        return {
          error: true,
          message: `Failed to extract content from URL: ${error.message}`,
          attempt: attempt + 1,
          isURLExtraction: true
        };
      }

      // Continue to next attempt
    }
  }
}
|
||||
16
hooks/hooks.json
Normal file
16
hooks/hooks.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"hooks": {
|
||||
"PostToolUse": [
|
||||
{
|
||||
"matcher": "WebSearch|WebFetch",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "node ${CLAUDE_PLUGIN_ROOT}/hooks/handle-web-search.mjs",
|
||||
"timeout": 30
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user