Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:57:03 +08:00
commit 590188e792
11 changed files with 3511 additions and 0 deletions

1706
hooks/content-extractor.mjs Normal file

File diff suppressed because it is too large Load Diff

117
hooks/handle-rate-limit.mjs Normal file
View File

@@ -0,0 +1,117 @@
// hooks/handle-rate-limit.mjs
import contentExtractor from './content-extractor.mjs';
/**
 * Recovers from a rate-limit error by waiting, then retrying the search with a
 * halved result count. If that retry fails, waits a further two minutes and
 * makes one last attempt with a single result and a simplified query.
 *
 * The initial wait honours the `retry-after` response header when present. The
 * header may carry either a number of seconds or an HTTP-date; both forms are
 * accepted. Without a usable header the wait defaults to one minute. Up to five
 * seconds of random jitter is always added.
 *
 * @param {Object} error - The rate limit error; `error.response.headers['retry-after']` is consulted
 * @param {Object} options - Search options (`query`, `maxResults`, ...) forwarded to the search call
 * @returns {Promise<Object>} `{ success: true, data, message }` on recovery, otherwise `{ error: true, message }`
 */
export async function handleRateLimit(error, options) {
  console.log('Handling rate limit error...');

  // Extract retry-after header if available, or use default delay
  let delay = 60000; // Default to 1 minute
  const retryAfterHeader = error?.response?.headers?.['retry-after'];
  if (retryAfterHeader) {
    const rawValue = String(retryAfterHeader).trim();
    const retryAfterSeconds = Number.parseInt(rawValue, 10);
    if (!Number.isNaN(retryAfterSeconds)) {
      // delay-seconds form; a negative value means "retry now"
      delay = Math.max(0, retryAfterSeconds) * 1000;
    } else {
      // HTTP-date form, e.g. "Wed, 21 Oct 2015 07:28:00 GMT"
      const retryAt = Date.parse(rawValue);
      if (!Number.isNaN(retryAt)) {
        delay = Math.max(0, retryAt - Date.now());
      }
    }
  }

  // Apply jitter to avoid thundering herd problem
  const jitter = Math.random() * 5000; // Up to 5 seconds
  delay += jitter;
  console.log(`Waiting ${Math.round(delay/1000)} seconds before retrying due to rate limiting...`);

  try {
    // Wait for the required time
    await new Promise(resolve => setTimeout(resolve, delay));

    // Try again with modified parameters to reduce load
    const modifiedParams = {
      ...options,
      headers: generateRateLimitHeaders(),
      maxResults: Math.max(1, Math.floor((options.maxResults || 5) / 2)) // Reduce number of results
    };
    const results = await contentExtractor.tavily.search(modifiedParams);
    return {
      success: true,
      data: results,
      message: 'Successfully retrieved results after handling rate limit'
    };
  } catch (retryError) {
    // Any failure of the first retry (not only a repeated rate limit) falls
    // through to the more conservative attempt below.
    try {
      // Wait an additional time
      await new Promise(resolve => setTimeout(resolve, 120000)); // 2 minutes
      const conservativeParams = {
        ...options,
        headers: generateVeryConservativeHeaders(),
        maxResults: 1, // Get just one result
        query: simplifyQuery(options.query)
      };
      const results = await contentExtractor.tavily.search(conservativeParams);
      return {
        success: true,
        data: results,
        message: 'Successfully retrieved results with conservative approach after rate limiting'
      };
    } catch (finalError) {
      return {
        error: true,
        message: `Rate limit handling failed after multiple attempts: ${finalError.message}`
      };
    }
  }
}
/**
 * Builds the fixed header set sent with the first retry after a rate limit.
 * The User-Agent identifies the request as Googlebot.
 * @returns {Object} Header name → value map
 */
function generateRateLimitHeaders() {
  const headerPairs = [
    ['User-Agent', 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'],
    ['Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'],
    ['Accept-Language', 'en-US,en;q=0.5'],
    ['Accept-Encoding', 'gzip, deflate'],
    ['Connection', 'keep-alive'],
    ['Upgrade-Insecure-Requests', '1'],
    ['DNT', '1']
  ];
  return Object.fromEntries(headerPairs);
}
/**
 * Builds the fixed header set used for the final, most conservative retry.
 * The User-Agent identifies the request as ArchiveBot.
 * @returns {Object} Header name → value map
 */
function generateVeryConservativeHeaders() {
  const headers = {};
  headers['User-Agent'] = 'Mozilla/5.0 (compatible; ArchiveBot/1.0; +http://archive.org/details/archivebot)';
  headers['Accept'] = 'text/html';
  headers['Accept-Language'] = 'en-US,en;q=0.9';
  // Don't request compression to reduce processing load
  headers['Accept-Encoding'] = 'identity';
  headers['Connection'] = 'close';
  headers['Cache-Control'] = 'max-age=0';
  return headers;
}
/**
 * Simplifies a query by dropping instructional phrases ("how to", "guide to",
 * "tutorial for") and intensity adjectives ("detailed", "comprehensive",
 * "complete"), then collapsing the whitespace left behind.
 *
 * If removing those terms would leave nothing (e.g. the query is just
 * "how to"), the trimmed original is returned so an empty query is never sent.
 *
 * @param {string} query - Original query
 * @returns {string} Simplified query
 */
function simplifyQuery(query) {
  const simplified = query
    .replace(/\b(how to|guide to|tutorial for)\b/gi, '')
    .replace(/\b(detailed|comprehensive|complete)\b/gi, '')
    .replace(/\s+/g, ' ') // removed terms leave doubled spaces behind
    .trim();
  return simplified || query.trim();
}

View File

@@ -0,0 +1,887 @@
// hooks/handle-search-error.mjs
import contentExtractor from './content-extractor.mjs';
import { handleRateLimit } from './handle-rate-limit.mjs';
// ============================================================================
// CONFIGURATION: Recovery strategy timeout
// ============================================================================
/**
 * Timeout applied to recovery strategies, in milliseconds.
 * Source: the SEARCH_PLUS_RECOVERY_TIMEOUT_MS environment variable, validated
 * by validateRecoveryTimeout. When the variable is unset or empty the string
 * '5000' (5 seconds) is validated instead - based on project requirements for
 * <5s average recovery.
 */
const RECOVERY_TIMEOUT_MS = validateRecoveryTimeout(
  process.env.SEARCH_PLUS_RECOVERY_TIMEOUT_MS || '5000'
);
/**
 * Parses a recovery-timeout setting and clamps it to the supported range.
 *
 * The value is read as a base-10 integer. Unparseable input yields 5000;
 * values under 100 yield 100; values over 60000 yield 60000. Each adjustment
 * is reported through console.warn.
 *
 * @param {string} value - The timeout value to validate
 * @returns {number} Validated timeout in milliseconds
 */
function validateRecoveryTimeout(value) {
  const timeoutMs = Number.parseInt(value, 10);

  // Not a number at all: fall back to the default
  if (Number.isNaN(timeoutMs)) {
    console.warn(`⚠️ Invalid SEARCH_PLUS_RECOVERY_TIMEOUT_MS: "${value}". Using default 5000ms.`);
    return 5000;
  }

  // Outside the reasonable bounds (100ms to 60s): clamp to the nearest bound
  if (timeoutMs > 60000) {
    console.warn(`⚠️ SEARCH_PLUS_RECOVERY_TIMEOUT_MS too high: ${timeoutMs}ms. Maximum is 60000ms. Using 60000ms.`);
    return 60000;
  }
  if (timeoutMs < 100) {
    console.warn(`⚠️ SEARCH_PLUS_RECOVERY_TIMEOUT_MS too low: ${timeoutMs}ms. Minimum is 100ms. Using 100ms.`);
    return 100;
  }

  return timeoutMs;
}
// In development mode, print the effective recovery timeout once at load time
if (process.env.NODE_ENV === 'development') console.log(
  `🔧 Search-Plus Recovery Timeout: ${RECOVERY_TIMEOUT_MS}ms`
);
/**
 * Builds the common failure object returned by recovery strategies.
 *
 * Keys supplied in `additionalInfo` are applied last, so they override the
 * standard fields when names collide.
 *
 * @param {string} strategy - Name of the strategy that failed
 * @param {Error|string} error - The failure; an Error contributes its `message`, anything else is used as-is
 * @param {number} startTime - Strategy start timestamp (ms since epoch)
 * @param {Object} additionalInfo - Additional context info
 * @returns {Object} `{ success: false, error, strategy, responseTime, timestamp, ...additionalInfo }`
 */
function createStandardErrorResponse(strategy, error, startTime, additionalInfo = {}) {
  const elapsedMs = Date.now() - startTime;

  let errorText = error;
  if (error instanceof Error) {
    errorText = error.message;
  }

  const response = {
    success: false,
    error: errorText,
    strategy,
    responseTime: elapsedMs,
    timestamp: new Date().toISOString()
  };
  return Object.assign(response, additionalInfo);
}
/**
 * Builds the common success object returned by recovery strategies.
 *
 * Keys supplied in `additionalInfo` are applied last, so they override the
 * standard fields when names collide.
 *
 * @param {string} strategy - Name of the strategy that succeeded
 * @param {*} data - The data returned by the strategy
 * @param {number} startTime - Strategy start timestamp (ms since epoch)
 * @param {Object} additionalInfo - Additional context info
 * @returns {Object} `{ success: true, data, strategy, responseTime, timestamp, ...additionalInfo }`
 */
function createStandardSuccessResponse(strategy, data, startTime, additionalInfo = {}) {
  const elapsedMs = Date.now() - startTime;
  const response = {
    success: true,
    data,
    strategy,
    responseTime: elapsedMs,
    timestamp: new Date().toISOString()
  };
  return Object.assign(response, additionalInfo);
}
/**
 * Routes a failed web search to the recovery handler matching its error type.
 *
 * Classification looks at `error.code` and at the text of `error.message`, in
 * this order: 403/forbidden, 451/security block, 422/schema validation,
 * 429/rate limit, connection refused, timeout. Errors that match none of these
 * are returned as a plain failure object.
 *
 * A missing or non-string `message` is treated as empty text, so an error that
 * carries only a `code` is still routed instead of raising a TypeError.
 *
 * @param {Object} error - The error object
 * @param {Object} options - Search options that caused the error
 * @returns {Promise<Object>} Recovery results or final error
 */
export async function handleWebSearchError(error, options) {
  console.log('Handling search error:', error);

  const code = error?.code;
  const message = String(error?.message ?? '');
  const lowerMessage = message.toLowerCase();

  // Check error type and apply appropriate recovery strategy
  if (code === 403 || message.includes('403') || lowerMessage.includes('forbidden')) {
    return await handle403Error(error, options);
  }
  if (code === 451 || message.includes('451') || lowerMessage.includes('securitycompromise') || lowerMessage.includes('blocked until')) {
    return await handle451SecurityError(error, options);
  }
  if (code === 422 || message.includes('422') || is422SchemaError(error)) {
    return await handle422Error(error, options);
  }
  if (code === 429 || message.includes('429') || lowerMessage.includes('rate limit')) {
    return await handleRateLimit(error, options);
  }
  if (code === 'ECONNREFUSED' || lowerMessage.includes('connection refused')) {
    return await handleConnectionRefusedError(error, options);
  }
  if (code === 'ETIMEDOUT' || lowerMessage.includes('timeout')) {
    return await handleTimeoutError(error, options);
  }

  // For other errors, return the original error
  return {
    error: true,
    message: `Search failed: ${message || 'unknown error'}`,
    code
  };
}
/**
 * Recovers from a 403 Forbidden error in two steps: first a retry with a fresh
 * header set after a two-second pause, then - if that fails - a retry with a
 * reformulated query.
 * @param {Object} error - The 403 error
 * @param {Object} options - Search options
 * @returns {Promise<Object>} `{ success: true, data, message }` or `{ error: true, message }`
 */
async function handle403Error(error, options) {
  console.log('Handling 403 error - trying with different headers...');

  // Attempt 1: same query, completely different headers
  try {
    const retryParams = {
      ...options,
      headers: generateDiverseHeaders()
    };
    // Pause before retrying
    await new Promise((resolve) => {
      setTimeout(resolve, 2000);
    });
    const data = await contentExtractor.tavily.search(retryParams);
    return {
      success: true,
      data,
      message: 'Successfully retrieved results after handling 403 error'
    };
  } catch (retryError) {
    console.log('403 retry failed, trying alternative approach...');
  }

  // Attempt 2: different formulation of the search query
  try {
    const query = reformulateQuery(options.query);
    const data = await contentExtractor.tavily.search({ ...options, query });
    return {
      success: true,
      data,
      message: 'Successfully retrieved results with reformulated query after 403 error'
    };
  } catch (finalError) {
    return {
      error: true,
      message: `Failed to retrieve results after handling 403 error: ${finalError.message}`
    };
  }
}
/**
 * Handles 451 SecurityCompromiseError (domain blocked due to abuse).
 *
 * Runs two recovery strategies in parallel - domain exclusion and alternative
 * sources - and returns the first one that actually succeeds. When
 * SEARCH_PLUS_451_SIMPLE_MODE is 'true' the work is delegated to
 * handleSimple451Recovery instead.
 *
 * The strategies report failure by resolving with `{ success: false }` rather
 * than rejecting, so each one is wrapped to reject on failure. Without that,
 * Promise.any would settle on whichever strategy finished first - including a
 * failed one - and report it as a success.
 *
 * @param {Object} error - The 451 error
 * @param {Object} options - Search options
 * @returns {Promise<Object>} Recovery results, or an enhanced error response when every strategy fails
 */
async function handle451SecurityError(error, options) {
  const blockedDomain = extractBlockedDomain(error.message || '');

  // Simple mode for power users who want minimal output
  if (process.env.SEARCH_PLUS_451_SIMPLE_MODE === 'true') {
    return await handleSimple451Recovery(error, options, blockedDomain);
  }

  // Enhanced UX logging by default
  console.log('🚫 451 SecurityCompromiseError detected');
  console.log(`📍 Blocked domain: ${blockedDomain || 'unknown'}`);
  console.log('🚀 Starting parallel recovery:');
  console.log(' 🛡️ Strategy 1: Domain exclusion');
  console.log(' 🔍 Strategy 2: Alternative sources');

  // Turn a resolved failure response into a rejection so Promise.any skips it
  const requireSuccess = (strategyPromise) => strategyPromise.then((outcome) => {
    if (outcome && outcome.success) {
      return outcome;
    }
    throw new Error(outcome && outcome.error ? String(outcome.error) : 'Recovery strategy failed');
  });

  // Optimized parallel execution using the two most effective strategies
  const strategies = [
    searchWithExcludedDomainUnified(options, blockedDomain, true),
    tryAlternativeSearchSources(options, true)
  ].map(requireSuccess);

  try {
    const results = await Promise.any(strategies);
    console.log(`✅ Success! Used strategy: ${results.strategy} (${results.responseTime}ms)`);

    // Provide actionable suggestions for future searches
    if (blockedDomain) {
      console.log(`💡 Next time, try: /search-plus "${options.query} -site:${blockedDomain}"`);
    }

    return {
      success: true,
      data: results.data,
      message: `Successfully retrieved results using ${results.strategy} for blocked domain ${blockedDomain || 'unknown'}`,
      strategy: results.strategy,
      responseTime: results.responseTime,
      blockedDomain: blockedDomain
    };
  } catch (aggregateError) {
    // Enhanced error classification and user guidance
    const failureType = classify451Failure(aggregateError, blockedDomain, options);
    console.log(`❌ All recovery strategies failed`);
    console.log(`🔍 Error type: ${failureType.type}`);
    if (failureType.suggestions.length > 0) {
      console.log('💡 Suggestions:');
      failureType.suggestions.forEach((suggestion, i) => {
        console.log(` ${i + 1}. ${suggestion.description}`);
      });
    }
    return generateEnhancedErrorResponse(failureType, blockedDomain, options);
  }
}
/**
 * Handles 451 errors in simple mode with minimal output.
 *
 * Runs the domain-exclusion and alternative-sources strategies in parallel and
 * returns the response of the first one that actually succeeds. The strategies
 * report failure by resolving with `{ success: false }`, so each is wrapped to
 * reject on failure; otherwise Promise.any would hand back a failed response
 * as if it were a recovery.
 *
 * @param {Object} error - The 451 error
 * @param {Object} options - Search options
 * @param {string} blockedDomain - The blocked domain
 * @returns {Promise<Object>} The winning strategy's response, or `{ error: true, message, blockedDomain }`
 */
async function handleSimple451Recovery(error, options, blockedDomain) {
  console.log('⚡ 451 error - attempting recovery...');

  // Turn a resolved failure response into a rejection so Promise.any skips it
  const requireSuccess = (strategyPromise) => strategyPromise.then((outcome) => {
    if (outcome && outcome.success) {
      return outcome;
    }
    throw new Error(outcome && outcome.error ? String(outcome.error) : 'Recovery strategy failed');
  });

  const strategies = [
    searchWithExcludedDomainUnified(options, blockedDomain, true),
    tryAlternativeSearchSources(options, true)
  ].map(requireSuccess);

  try {
    const results = await Promise.any(strategies);
    console.log(`⚡ 451 recovered in ${results.responseTime}ms`);
    return results;
  } catch (aggregateError) {
    console.log('❌ 451 recovery failed');
    return {
      error: true,
      message: `Failed to recover from 451 error. Domain ${blockedDomain || 'unknown'} is blocked.`,
      blockedDomain: blockedDomain
    };
  }
}
/**
 * Classifies 451 failure types for enhanced error handling.
 *
 * A failure is a 'permanent-block' when any underlying error message contains
 * "blocked until"; everything else is 'recovery-failed'. Underlying errors are
 * read from `aggregateError.errors`; if that is not an array, the value passed
 * in is itself treated as the single error. Entries may be Error objects,
 * plain strings, or objects without a usable `message` (treated as empty).
 *
 * The suggested retry command only carries a `-site:` exclusion when a blocked
 * domain is actually known.
 *
 * @param {AggregateError} aggregateError - The combined error from failed strategies
 * @param {string} blockedDomain - The blocked domain (may be null when it could not be determined)
 * @param {Object} options - Original search options
 * @returns {Object} Failure classification with suggestions
 */
function classify451Failure(aggregateError, blockedDomain, options) {
  const underlyingErrors = Array.isArray(aggregateError?.errors)
    ? aggregateError.errors
    : [aggregateError];
  const messages = underlyingErrors.map((err) =>
    (typeof err === 'string' ? err : String(err?.message ?? ''))
  );

  // Avoid emitting "-site:null" when the domain could not be extracted
  const retryCommand = blockedDomain
    ? `/search-plus "${options.query} -site:${blockedDomain}"`
    : `/search-plus "${options.query}"`;

  // Check for permanent block patterns
  if (messages.some((message) => message.includes('blocked until'))) {
    return {
      type: 'permanent-block',
      suggestions: [
        {
          type: 'ready-to-run',
          command: retryCommand,
          description: 'Exclude blocked domain and search again'
        },
        {
          type: 'manual-search',
          url: `https://www.google.com/search?q=${encodeURIComponent(options.query)}`,
          description: 'Search manually in external browser'
        }
      ],
      autoSuggestion: {
        message: 'For more predictable results, enable simple 451 handling?',
        command: 'export SEARCH_PLUS_451_SIMPLE_MODE=true',
        benefit: 'Provides clear guidance instead of complex automation'
      }
    };
  }

  // Default classification
  return {
    type: 'recovery-failed',
    suggestions: [
      {
        type: 'ready-to-run',
        command: retryCommand,
        description: 'Try again excluding the blocked domain'
      }
    ],
    autoSuggestion: {
      message: 'Want simpler error handling?',
      command: 'export SEARCH_PLUS_451_SIMPLE_MODE=true',
      benefit: 'Minimal output with focus on results'
    }
  };
}
/**
 * Assembles the final error object for an unrecovered 451 failure, carrying
 * over the classification's type, suggestions and auto-suggestion.
 * @param {Object} failureType - The classified failure type
 * @param {string} blockedDomain - The blocked domain; shown as 'unknown' in the message when falsy
 * @param {Object} options - Original search options (not read by this function)
 * @returns {Object} Enhanced error response
 */
function generateEnhancedErrorResponse(failureType, blockedDomain, options) {
  const { type, suggestions, autoSuggestion } = failureType;
  const domainLabel = blockedDomain || 'unknown';
  const message = `Failed to retrieve results after handling 451 SecurityCompromiseError. Domain ${domainLabel} is blocked.`;

  return {
    error: true,
    message,
    blockedDomain,
    failureType: type,
    suggestions,
    autoSuggestion
  };
}
/**
 * Extracts the blocked domain from an error message of the form
 * "domain <name> blocked" or "access to <name> blocked" (case-insensitive).
 * @param {string} errorMessage - The error message; non-string input yields null
 * @returns {string|null} The blocked domain or null if not found
 */
function extractBlockedDomain(errorMessage) {
  // Errors matched by code alone may carry no message at all
  if (typeof errorMessage !== 'string') {
    return null;
  }
  const domainMatch = errorMessage.match(/domain (\S+) blocked/i) ||
    errorMessage.match(/access to (\S+) blocked/i);
  return domainMatch ? domainMatch[1] : null;
}
/**
 * Extracts the block expiration date from an error message.
 *
 * Captures the text following "blocked until", stopping at a following " due"
 * clause or at the end of the message.
 *
 * @param {string} errorMessage - The error message; non-string input yields null
 * @returns {string|null} The block expiration date or null if not found
 */
function extractBlockUntilDate(errorMessage) {
  if (typeof errorMessage !== 'string') {
    return null;
  }
  // Look for "blocked until" followed by a date, capturing until the next reason or end
  const dateMatch = errorMessage.match(/blocked until (.+?)(?:\s+due|$)/i);
  return dateMatch ? dateMatch[1].trim() : null;
}
/**
 * Alternative search sources with configurable optimization level.
 *
 * Re-runs the query with "alternative OR substitute OR replacement" appended
 * (plus a `-site:` exclusion when a blocked domain is known), capped at 8
 * results. In optimized mode the search is aborted after 1500ms through an
 * AbortSignal passed as `signal`; in standard mode the search races a
 * RECOVERY_TIMEOUT_MS timer.
 *
 * Both modes resolve with a standardized response object and never reject.
 * Whichever timer was armed is always cleared before returning, so a finished
 * call leaves nothing pending on the event loop.
 *
 * @param {Object} options - Original search options
 * @param {boolean} optimized - Whether to use optimized timeouts for parallel execution
 * @returns {Promise<Object>} Standardized success or error response
 */
async function tryAlternativeSearchSources(options, optimized = false) {
  const startTime = Date.now();
  const strategyName = 'alternative-search-sources';
  const timeout = optimized ? 1500 : RECOVERY_TIMEOUT_MS;
  let timeoutId;

  try {
    console.log(optimized ? '🔍 Trying alternative search sources...' : 'Trying alternative search sources...');

    const blockedDomain = optimized ? (options.blockedDomain || null) : (options.error ? extractBlockedDomain(options.error.message || '') : null);
    const domainFilter = blockedDomain ? `-site:${blockedDomain}` : '';
    // Join only the non-empty parts so a missing filter leaves no doubled space
    const modifiedQuery = [options.query, domainFilter, 'alternative OR substitute OR replacement']
      .filter(Boolean)
      .join(' ');
    const modifiedParams = {
      ...options,
      query: modifiedQuery,
      include_answer: true,
      max_results: Math.min(options.max_results || 10, 8)
    };

    if (optimized) {
      // Abort the in-flight search when the optimized timeout elapses
      const abortController = new AbortController();
      timeoutId = setTimeout(() => {
        abortController.abort();
      }, timeout);
      try {
        const results = await contentExtractor.tavily.search({
          ...modifiedParams,
          signal: abortController.signal
        });
        return createStandardSuccessResponse(strategyName, results, startTime);
      } catch (searchError) {
        if (searchError.name === 'AbortError') {
          throw new Error('Strategy timeout');
        }
        throw searchError;
      }
    }

    // Standard mode: race the search against a timeout that resolves with an
    // error response. The search result is wrapped so both outcomes share the
    // standardized shape.
    const strategyPromise = (async () => {
      const results = await contentExtractor.tavily.search(modifiedParams);
      return createStandardSuccessResponse(strategyName, results, startTime);
    })();
    const timeoutPromise = new Promise((resolve) => {
      timeoutId = setTimeout(() => resolve(createStandardErrorResponse(strategyName, `Strategy timed out after ${timeout}ms`, startTime)), timeout);
    });
    return await Promise.race([strategyPromise, timeoutPromise]);
  } catch (error) {
    return createStandardErrorResponse(strategyName, error, startTime);
  } finally {
    // Runs on every exit path; clearing an unset or already-fired timer is a no-op
    clearTimeout(timeoutId);
  }
}
/**
 * Domain exclusion search with configurable optimization level.
 *
 * Re-runs the query with `-site:<blockedDomain>` appended and a fresh header
 * set. In optimized mode the search is aborted after 1000ms through an
 * AbortSignal passed as `signal`. In standard mode the search starts after a
 * three-second pause and races a RECOVERY_TIMEOUT_MS timer.
 *
 * Always resolves with a standardized response object and never rejects. All
 * timers armed here are cleared before returning; in standard mode that also
 * cancels a still-pending pre-search pause, so a strategy that has already
 * timed out does not go on to issue its search.
 *
 * @param {Object} options - Original search options
 * @param {string} blockedDomain - The blocked domain; a falsy value yields an error response
 * @param {boolean} optimized - Whether to use optimized timeouts for parallel execution
 * @returns {Promise<Object>} Standardized success or error response
 */
async function searchWithExcludedDomainUnified(options, blockedDomain, optimized = false) {
  const startTime = Date.now();
  const strategyName = 'excluded-domain-search';
  const timeout = optimized ? 1000 : RECOVERY_TIMEOUT_MS;
  let timeoutId;
  let delayId;

  try {
    if (!blockedDomain) {
      return createStandardErrorResponse(strategyName, 'No blocked domain to exclude', startTime);
    }
    console.log(optimized ? `🛡️ Excluding domain: ${blockedDomain}` : `Searching while excluding domain: ${blockedDomain}`);

    const exclusionQuery = `${options.query} -site:${blockedDomain}`;
    const modifiedParams = {
      ...options,
      query: exclusionQuery,
      headers: generateDiverseHeaders()
    };

    if (optimized) {
      // Abort the in-flight search when the optimized timeout elapses
      const abortController = new AbortController();
      timeoutId = setTimeout(() => {
        abortController.abort();
      }, timeout);
      try {
        const results = await contentExtractor.tavily.search({
          ...modifiedParams,
          signal: abortController.signal
        });
        return createStandardSuccessResponse(strategyName, results, startTime);
      } catch (searchError) {
        if (searchError.name === 'AbortError') {
          throw new Error('Strategy timeout');
        }
        throw searchError;
      }
    }

    // Standard mode with timeout promise and delay
    const strategyPromise = (async () => {
      await new Promise((resolve) => {
        delayId = setTimeout(resolve, 3000);
      });
      const results = await contentExtractor.tavily.search(modifiedParams);
      return createStandardSuccessResponse(strategyName, results, startTime);
    })();
    const timeoutPromise = new Promise((resolve) => {
      timeoutId = setTimeout(() => resolve(createStandardErrorResponse(strategyName, `Strategy timed out after ${timeout}ms`, startTime)), timeout);
    });
    return await Promise.race([strategyPromise, timeoutPromise]);
  } catch (error) {
    return createStandardErrorResponse(strategyName, error, startTime);
  } finally {
    // Runs on every exit path; clearing an unset or already-fired timer is a no-op
    clearTimeout(timeoutId);
    clearTimeout(delayId);
  }
}
/**
 * Reformulates query to avoid references to blocked domains.
 *
 * Every occurrence of the blocked domain in the query (case-insensitive) is
 * replaced with a generic description - a specific one for a few known
 * domains, 'online service' otherwise. The search then runs with
 * `search_depth: "basic"` after a 2.5 second pause, racing a
 * RECOVERY_TIMEOUT_MS timer.
 *
 * The domain is matched literally: regex metacharacters in it (notably '.')
 * are escaped, so "github.com" does not also match "githubXcom". The timeout
 * timer is cleared once the race settles.
 *
 * @param {Object} options - Original search options
 * @param {string} blockedDomain - The blocked domain
 * @returns {Promise<Object>} `{ success, data|error, strategy, responseTime }`
 */
async function reformulateQueryAvoidingBlockedDomain(options, blockedDomain) {
  const startTime = Date.now();
  const strategyName = 'reformulate-query';

  const strategyPromise = (async () => {
    try {
      console.log('Reformulating query to avoid blocked domain references...');
      let reformulatedQuery = options.query;
      if (blockedDomain) {
        const domainMappings = new Map([
          ['httpbin.org', 'HTTP testing API endpoint service'],
          ['github.com', 'code repository platform'],
          ['stackoverflow.com', 'programming Q&A website'],
          ['medium.com', 'blogging platform']
        ]);
        const genericTerm = domainMappings.get(blockedDomain.toLowerCase()) || 'online service';
        const escapedDomain = blockedDomain.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        // A replacer function keeps '$' sequences in the term from being interpreted
        reformulatedQuery = options.query.replace(new RegExp(escapedDomain, 'gi'), () => genericTerm);
      }
      const modifiedParams = { ...options, query: reformulatedQuery, search_depth: "basic" };
      await new Promise(resolve => setTimeout(resolve, 2500));
      const results = await contentExtractor.tavily.search(modifiedParams);
      return { success: true, data: results, strategy: strategyName, responseTime: Date.now() - startTime };
    } catch (error) {
      return { success: false, error: error.message, strategy: strategyName, responseTime: Date.now() - startTime };
    }
  })();

  let timeoutId;
  const timeoutPromise = new Promise((resolve) => {
    timeoutId = setTimeout(() => resolve({
      success: false,
      error: `Strategy timed out after ${RECOVERY_TIMEOUT_MS}ms`,
      strategy: strategyName,
      responseTime: Date.now() - startTime
    }), RECOVERY_TIMEOUT_MS);
  });

  try {
    return await Promise.race([strategyPromise, timeoutPromise]);
  } finally {
    // Don't leave the timeout pending once the race has settled
    clearTimeout(timeoutId);
  }
}
/**
 * Attempts to use cached or archived results for blocked content.
 *
 * Searches for archive/cache wording alongside the original query (scoped with
 * a quoted `site:` term when a blocked domain is known), capped at 5 results.
 * The search runs after a four-second pause and races a RECOVERY_TIMEOUT_MS
 * timer; that timer is cleared once the race settles so it does not linger on
 * the event loop.
 *
 * @param {Object} options - Original search options
 * @param {string} blockedDomain - The blocked domain
 * @returns {Promise<Object>} `{ success, data|error, strategy, responseTime }`
 */
async function useCachedOrArchiveResults(options, blockedDomain) {
  const startTime = Date.now();
  const strategyName = 'archive-search';

  const strategyPromise = (async () => {
    try {
      console.log('Searching for archived or cached content...');
      const archiveQuery = blockedDomain
        ? `${options.query} web archive OR wayback machine OR cached version "site:${blockedDomain}"`
        : `${options.query} archived OR cached OR mirror`;
      const modifiedParams = { ...options, query: archiveQuery, max_results: Math.min(options.max_results || 10, 5) };
      await new Promise(resolve => setTimeout(resolve, 4000));
      const results = await contentExtractor.tavily.search(modifiedParams);
      return { success: true, data: results, strategy: strategyName, responseTime: Date.now() - startTime };
    } catch (error) {
      return { success: false, error: error.message, strategy: strategyName, responseTime: Date.now() - startTime };
    }
  })();

  let timeoutId;
  const timeoutPromise = new Promise((resolve) => {
    timeoutId = setTimeout(() => resolve({
      success: false,
      error: `Strategy timed out after ${RECOVERY_TIMEOUT_MS}ms`,
      strategy: strategyName,
      responseTime: Date.now() - startTime
    }), RECOVERY_TIMEOUT_MS);
  });

  try {
    return await Promise.race([strategyPromise, timeoutPromise]);
  } finally {
    // Don't leave the timeout pending once the race has settled
    clearTimeout(timeoutId);
  }
}
/**
 * Recovers from a connection-refused error by pausing five seconds and then
 * retrying once with a fresh header set and a timeout extended by 5000ms
 * (starting from 10000ms when the options carry none).
 * @param {Object} error - The connection error
 * @param {Object} options - Search options
 * @returns {Promise<Object>} `{ success: true, data, message }` or `{ error: true, message }`
 */
async function handleConnectionRefusedError(error, options) {
  console.log('Handling connection refused error...');
  const pause = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

  try {
    // Sometimes waiting and retrying works
    await pause(5000);

    // Retry with different headers and a longer timeout
    const headers = generateDiverseHeaders();
    const timeout = (options.timeout || 10000) + 5000;
    const data = await contentExtractor.tavily.search({ ...options, headers, timeout });

    return {
      success: true,
      data,
      message: 'Successfully retrieved results after handling connection refused error'
    };
  } catch (retryError) {
    const message = `Failed to retrieve results after handling connection refused error: ${retryError.message}`;
    return { error: true, message };
  }
}
/**
 * Recovers from a timeout error by retrying once, immediately, with a fresh
 * header set and a doubled timeout capped at 30000ms (doubling starts from
 * 10000ms when the options carry none).
 * @param {Object} error - The timeout error
 * @param {Object} options - Search options
 * @returns {Promise<Object>} `{ success: true, data, message }` or `{ error: true, message }`
 */
async function handleTimeoutError(error, options) {
  console.log('Handling timeout error...');

  try {
    const headers = generateDiverseHeaders();
    const doubledTimeout = (options.timeout || 10000) * 2;
    const timeout = Math.min(doubledTimeout, 30000); // never wait longer than 30s
    const data = await contentExtractor.tavily.search({ ...options, headers, timeout });

    return {
      success: true,
      data,
      message: 'Successfully retrieved results after handling timeout error'
    };
  } catch (retryError) {
    const message = `Failed to retrieve results after handling timeout error: ${retryError.message}`;
    return { error: true, message };
  }
}
/**
 * Builds a browser-like header set with a randomly chosen User-Agent and
 * Accept-Language; every other header is fixed.
 * @returns {Object} Header name → value map
 */
function generateDiverseHeaders() {
  // Uniform random pick from a non-empty list
  const pickRandom = (choices) => choices[Math.floor(Math.random() * choices.length)];

  const userAgent = pickRandom([
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  ]);
  const acceptLanguage = pickRandom([
    'en-US,en;q=0.9',
    'en-GB,en;q=0.9',
    'en-CA,en;q=0.9',
    'en-AU,en;q=0.9'
  ]);

  return {
    'User-Agent': userAgent,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': acceptLanguage,
    'Accept-Encoding': 'gzip, deflate, br',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Cache-Control': 'max-age=0'
  };
}
/**
 * Detects if error is a 422 schema validation error.
 *
 * Looks for known schema-validation phrases, case-insensitively, in the
 * error's message and in its JSON serialization. Errors that cannot be
 * serialized (for example because of circular references) are judged on their
 * message alone.
 *
 * NOTE(review): the 'missing' pattern is very broad and will match unrelated
 * errors whose text happens to contain that word - confirm this is intended.
 *
 * @param {Object} error - The error object
 * @returns {boolean} True if this is a 422 schema error
 */
function is422SchemaError(error) {
  const errorMessage = String(error?.message ?? '').toLowerCase();

  let errorString = '';
  try {
    // JSON.stringify returns undefined for some inputs (e.g. undefined itself)
    errorString = (JSON.stringify(error) ?? '').toLowerCase();
  } catch {
    // Deliberate best effort: an unserializable error (circular structure,
    // BigInt, ...) simply contributes no serialized text to the check.
    errorString = '';
  }

  // Check for common 422 schema validation patterns. All patterns are
  // lower-case because they are compared against lower-cased text.
  const schemaErrorPatterns = [
    'missing',
    'input_schema',
    'field required',
    'unprocessable entity',
    'validation error',
    'schema validation',
    'invalid request format'
  ];
  return schemaErrorPatterns.some(pattern =>
    errorMessage.includes(pattern) ||
    errorString.includes(pattern)
  );
}
/**
 * Recovers from a 422 Unprocessable Entity (schema validation) error by trying
 * four strategies one after another - schema repair, query simplification,
 * query reformulation, and a minimal-parameter request - and returning the
 * first truthy result that has no `error` property.
 * @param {Object} error - The 422 error
 * @param {Object} options - Search options
 * @returns {Promise<Object>} `{ success: true, data, message }` or `{ error: true, message }`
 */
async function handle422Error(error, options) {
  console.log('Handling 422 schema validation error...');

  // Tried in this order; each receives the original options
  const recoveryAttempts = [
    repairSchemaAndRetry,
    simplifyQueryAndRetry,
    reformulateQueryForSchema,
    tryAlternativeAPIFormat
  ];

  for (const attempt of recoveryAttempts) {
    let results;
    try {
      console.log('Attempting 422 error recovery strategy...');
      results = await attempt(options);
    } catch (strategyError) {
      console.log('422 recovery strategy failed:', strategyError.message);
      continue;
    }

    if (results && !results.error) {
      return {
        success: true,
        data: results,
        message: 'Successfully retrieved results after handling 422 schema error'
      };
    }
  }

  return {
    error: true,
    message: `Failed to retrieve results after handling 422 schema error: ${error.message}`
  };
}
/**
 * Retries the search after a one-second pause with an `input_schema` object
 * added to (or replacing the one in) the original options.
 * @param {Object} options - Original search options
 * @returns {Promise<Object>} Search results
 */
async function repairSchemaAndRetry(options) {
  console.log('Attempting schema repair...');

  // Supply input_schema in case its absence is what triggered the 422
  const input_schema = {
    type: "web_search_20250305",
    name: "web_search",
    max_uses: 8
  };
  const repairedParams = { ...options, input_schema };

  // Brief pause before the retry
  await new Promise((resolve) => {
    setTimeout(resolve, 1000);
  });

  const results = await contentExtractor.tavily.search(repairedParams);
  return results;
}
/**
 * Retries the search after a 1.5 second pause with a simplified query, at most
 * 5 results, and `search_depth: "basic"`.
 * @param {Object} options - Original search options
 * @returns {Promise<Object>} Search results
 */
async function simplifyQueryAndRetry(options) {
  console.log('Simplifying query for schema compatibility...');

  const query = simplifyQueryForSchema(options.query);
  const max_results = Math.min(options.max_results || 10, 5); // fewer results, less complexity
  const simplifiedParams = {
    ...options,
    query,
    max_results,
    search_depth: "basic" // simpler search mode
  };

  await new Promise((resolve) => {
    setTimeout(resolve, 1500);
  });

  const results = await contentExtractor.tavily.search(simplifiedParams);
  return results;
}
/**
 * Retries the search after a two-second pause with a reformulated query and
 * with `include_answer` and `include_raw_content` both switched off.
 * @param {Object} options - Original search options
 * @returns {Promise<Object>} Search results
 */
async function reformulateQueryForSchema(options) {
  console.log('Reformulating query for schema compatibility...');

  const query = reformulateQueryForSchemaCompatibility(options.query);
  // Answer and raw content are dropped to keep the request simple
  const reformulatedParams = {
    ...options,
    query,
    include_answer: false,
    include_raw_content: false
  };

  await new Promise((resolve) => {
    setTimeout(resolve, 2000);
  });

  const results = await contentExtractor.tavily.search(reformulatedParams);
  return results;
}
/**
 * Retries the search after a three-second pause using only `query`, `api_key`
 * and `search_depth: "basic"`; every other original option is left out.
 * @param {Object} options - Original search options
 * @returns {Promise<Object>} Search results
 */
async function tryAlternativeAPIFormat(options) {
  console.log('Trying alternative API format...');

  // Keep just the minimal parameters
  const { query, api_key } = options;
  const minimalParams = { query, api_key, search_depth: "basic" };

  await new Promise((resolve) => {
    setTimeout(resolve, 3000);
  });

  const results = await contentExtractor.tavily.search(minimalParams);
  return results;
}
/**
 * Simplifies query for schema compatibility.
 *
 * Removes every character that is not a letter, combining mark, digit,
 * underscore, whitespace, or one of `- . , ! ?`, collapses whitespace runs to
 * single spaces, and limits the result to 200 UTF-16 code units.
 *
 * Letters and digits are matched with Unicode property escapes, so queries in
 * non-Latin scripts or with accented characters keep their words. Whitespace
 * is collapsed after stripping so a removed symbol between two spaces does not
 * leave a doubled space.
 *
 * @param {string} query - Original query
 * @returns {string} Simplified query
 */
function simplifyQueryForSchema(query) {
  return query
    .replace(/[^\p{L}\p{M}\p{N}_\s\-.,!?]/gu, '') // Remove special characters except basic punctuation
    .replace(/\s+/g, ' ') // Normalize whitespace
    .substring(0, 200) // Limit length
    .trim();
}
/**
 * Reformulates query specifically for schema compatibility issues.
 *
 * A query with more than eight words longer than two characters is cut down to
 * the first six such words. Shorter queries instead have standalone four-digit
 * numbers removed, specific platform names and a few terms replaced with
 * generic wording, and leftover whitespace collapsed.
 *
 * Only standalone four-digit numbers are removed; digits inside longer numbers
 * or identifiers ("80800", "v2024") are left untouched.
 *
 * @param {string} query - Original query
 * @returns {string} Reformulated query
 */
function reformulateQueryForSchemaCompatibility(query) {
  // Break down complex queries into simpler components
  const words = query.split(/\s+/).filter(word => word.length > 2);
  if (words.length > 8) {
    // If query is too long, keep only the leading terms
    return words.slice(0, 6).join(' ');
  }

  // Replace problematic patterns
  return query
    .replace(/\b\d{4}\b/g, '') // Remove years
    .replace(/github|gitlab|bitbucket/gi, 'code repository') // Replace specific platforms
    .replace(/open source|open-source/gi, 'free software') // Simplify terminology
    .replace(/platform|boilerplate|framework/gi, 'software') // Generic terms
    .replace(/\s+/g, ' ') // Close the gaps left by removed terms
    .trim();
}
/**
 * Reformulates a query by swapping common question openers for equivalent
 * phrasings ("how to" → "guide for", "what is" → "information about",
 * "why is" → "reason for", "when did" → "date of"), case-insensitively.
 *
 * Phrases are matched on word boundaries only, so text such as "show to" or
 * "somewhat is" is left alone.
 *
 * @param {string} query - Original query
 * @returns {string} Reformulated query
 */
function reformulateQuery(query) {
  // Simple reformulation - could be enhanced with more sophisticated NLP
  const synonyms = {
    'how to': 'guide for',
    'what is': 'information about',
    'why is': 'reason for',
    'when did': 'date of'
  };
  let reformulated = query;
  for (const [original, replacement] of Object.entries(synonyms)) {
    reformulated = reformulated.replace(new RegExp(`\\b${original}\\b`, 'gi'), replacement);
  }
  return reformulated;
}
// Export additional functions for testing
export {
classify451Failure,
validateRecoveryTimeout,
createStandardErrorResponse,
createStandardSuccessResponse
};

458
hooks/handle-web-search.mjs Normal file
View File

@@ -0,0 +1,458 @@
// hooks/handle-web-search.mjs
import { tavily, extractContent } from './content-extractor.mjs';
import { handleWebSearchError } from './handle-search-error.mjs';
// API keys are read from SEARCH_PLUS_-prefixed environment variables first,
// then from the older un-prefixed names; null when neither is set.
const TAVILY_API_KEY = process.env.SEARCH_PLUS_TAVILY_API_KEY || process.env.TAVILY_API_KEY || null;
const JINAAI_API_KEY = process.env.SEARCH_PLUS_JINAAI_API_KEY || process.env.JINAAI_API_KEY || null;

// Warn once per legacy variable that is set without its namespaced replacement
for (const [legacyName, namespacedName, warning] of [
  ['TAVILY_API_KEY', 'SEARCH_PLUS_TAVILY_API_KEY', '⚠️ TAVILY_API_KEY is deprecated. Please update to SEARCH_PLUS_TAVILY_API_KEY'],
  ['JINAAI_API_KEY', 'SEARCH_PLUS_JINAAI_API_KEY', '⚠️ JINAAI_API_KEY is deprecated. Please update to SEARCH_PLUS_JINAAI_API_KEY']
]) {
  if (process.env[legacyName] && !process.env[namespacedName]) {
    console.warn(warning);
  }
}
/**
 * Tells whether the input parses as an absolute http(s) URL.
 * @param {string} input - The input to check
 * @returns {boolean} True only when `new URL(input)` succeeds and the scheme
 *   is `http:` or `https:`
 */
function isURL(input) {
  let parsed;
  try {
    parsed = new URL(input);
  } catch {
    // Not parseable as an absolute URL at all.
    return false;
  }
  return ['http:', 'https:'].includes(parsed.protocol);
}
/**
 * Handles web search requests with enhanced error handling.
 *
 * When the query parses as an http(s) URL, content extraction is performed
 * instead of a search. Otherwise the hybrid search strategy runs once and, if
 * it throws, the error is handed to `handleWebSearchError` for a recovery
 * attempt.
 *
 * @param {Object} [params] - Search parameters
 * @param {string} [params.query] - Search query or URL (`params.q` is accepted as an alias)
 * @param {number} [params.maxRetries=3] - Retry budget for URL extraction; `0` is honoured
 * @param {number} [params.timeout=10000] - Timeout in milliseconds
 * @param {number} [params.maxResults=5] - Maximum number of search results
 * @param {boolean} [params.includeAnswer=true] - Set to `false` to opt out of an answer
 * @param {boolean} [params.includeRawContent=false] - Whether to request raw content
 * @returns {Promise<Object>} Either a `{ success: true, ... }` result or an
 *   `{ error: true, message, ... }` description; for URLs, whatever
 *   `handleURLExtraction` returns
 */
export async function handleWebSearch(params = {}) {
  const query = params.query || params.q || '';
  // `??` rather than `||` so an explicit `maxRetries: 0` is not replaced by the default.
  const maxRetries = params.maxRetries ?? 3;
  const timeout = params.timeout || 10000; // 10 seconds default

  if (!query) {
    return {
      error: true,
      message: 'No search query or URL provided',
    };
  }

  // Check if the query is a URL and handle extraction
  if (isURL(query)) {
    console.log(`🔍 Extracting content from URL: ${query}`);
    const result = await handleURLExtraction(query, { maxRetries, timeout });
    // Provide brief status feedback
    if (result.success) {
      console.log(`✅ URL extraction completed successfully`);
    } else {
      console.log(`❌ URL extraction failed: ${result.message}`);
    }
    return result;
  }

  // Past this point the query is known not to be a URL.
  console.log(`🔍 Searching: ${query}`);

  // Built once so the search and the recovery path always agree on the
  // options. In particular `includeAnswer: false` must survive into recovery.
  const baseOptions = {
    query,
    maxResults: params.maxResults || 5,
    includeAnswer: params.includeAnswer !== false,
    includeRawContent: params.includeRawContent || false,
  };

  // Use hybrid search strategy
  try {
    const searchParams = { ...baseOptions, headers: generateRandomHeaders() };
    const result = await performHybridSearch(searchParams, timeout);
    return {
      success: true,
      data: result.data,
      service: result.service,
      attempt: 1,
    };
  } catch (error) {
    console.error('All search strategies failed:', error.message);

    // Final error handling for recovery attempts (with a fresh set of headers)
    const errorResult = await handleWebSearchError(error, {
      ...baseOptions,
      headers: generateRandomHeaders(),
      timeout,
      attempt: 1,
      error,
    });

    if (errorResult && errorResult.success) {
      return {
        success: true,
        data: errorResult.data,
        attempt: 1,
        errorRecovered: true,
        originalError: error.message,
        recoveryMessage: errorResult.message,
      };
    }

    return {
      error: true,
      message: errorResult?.message || error.message,
      attempt: 1,
      errorHandlingApplied: true,
    };
  }
}
/**
 * Hybrid web search with intelligent service selection
 * Sequential: Tavily → Parallel free services
 * Note: Jina API is only used for URL extraction, not web search
 *
 * Tavily is tried first, and only when an API key is configured. If it fails
 * (or no key is set), SearXNG, DuckDuckGo HTML and Startpage HTML are queried
 * concurrently and the first one to succeed wins.
 *
 * @param {Object} params - Search parameters passed through to each service
 * @param {number} [timeoutMs=10000] - Per-request timeout in milliseconds
 * @returns {Promise<{data: Object, service: string}>} Results plus the name of
 *   the service that produced them
 * @throws {Error} When every service fails; `error.cause` is the
 *   `AggregateError` holding the individual failures
 */
async function performHybridSearch(params, timeoutMs = 10000) {
  // Phase 1: Try Tavily API (premium service)
  if (TAVILY_API_KEY) {
    try {
      console.log('🚀 Trying Tavily API...');
      const result = await tavily.search(params, timeoutMs);
      return { data: result, service: 'tavily' };
    } catch (error) {
      // Falling back is deliberate, but report why so a bad key or quota
      // problem is visible instead of silently degrading.
      console.log(`🔄 Tavily failed (${error?.message ?? error}), trying free services...`);
    }
  }

  // Phase 2: Parallel execution for free services
  console.log('🌐 Trying all free search engines in parallel...');
  const freeStrategies = [
    trySearXNGSearch(params, timeoutMs),
    tryDuckDuckGoHTML(params, timeoutMs),
    tryStartpageHTML(params, timeoutMs),
  ];

  try {
    const result = await Promise.any(freeStrategies);
    console.log(`✅ Success with free service: ${result.service}`);
    return result;
  } catch (aggregateError) {
    // Keep the per-service failures reachable through `cause`.
    throw new Error(
      'All search services failed. Try again or configure Tavily API key for enhanced reliability.',
      { cause: aggregateError }
    );
  }
}
/**
 * Attempts search using SearXNG metasearch engine.
 *
 * The hard-coded public instances are tried one after another; the first one
 * that answers with a non-empty `results` array wins. Instances that error,
 * return a non-OK status or return no results are skipped.
 *
 * @param {Object} params - Search parameters
 * @param {string} params.query - Search query
 * @param {number} [params.maxResults=5] - Maximum number of results to return
 * @param {Object} [params.headers] - Extra headers; these override the defaults below
 * @param {number} [timeoutMs=10000] - Per-request timeout in milliseconds
 * @returns {Promise<{data: Object, service: string}>} Tavily-like results with
 *   `service: 'searxng'`
 * @throws {Error} When no instance produced results
 */
async function trySearXNGSearch(params, timeoutMs = 10000) {
  const searxngInstances = [
    'https://search.brave.works',
    'https://searx.be',
    'https://searx.tiekoetter.com',
    'https://search.snopyta.org',
  ];
  const query = encodeURIComponent(params.query);
  const maxResults = params.maxResults || 5;
  // Monotonic start mark, so the reported time is a real elapsed duration
  // covering every instance that had to be tried.
  const startedAt = performance.now();

  for (const instance of searxngInstances) {
    try {
      const searchUrl = `${instance}/search?q=${query}&format=json&engines=google,duckduckgo,startpage&results=${maxResults}`;
      const response = await fetch(searchUrl, {
        method: 'GET',
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept': 'application/json, text/plain, */*',
          'Accept-Language': 'en-US,en;q=0.9',
          'Accept-Encoding': 'gzip, deflate',
          'Connection': 'keep-alive',
          'Referer': instance,
          'Sec-Fetch-Dest': 'empty',
          'Sec-Fetch-Mode': 'cors',
          'Sec-Fetch-Site': 'same-origin',
          ...params.headers,
        },
        signal: AbortSignal.timeout(timeoutMs),
      });

      if (!response.ok) {
        continue; // Try next instance
      }

      const data = await response.json();
      if (!data.results || data.results.length === 0) {
        continue; // Try next instance
      }

      // Transform SearXNG results to Tavily-like format
      const transformedResults = {
        results: data.results.slice(0, maxResults).map((item, index) => ({
          title: item.title,
          url: item.url,
          content: item.content || '',
          // Simple rank-based scoring, floored at 0 so it never goes
          // negative when more than ten results are requested.
          score: Math.max(0, 1.0 - index * 0.1),
          published_date: item.publishedDate || null,
        })),
        answer: data.answers?.[0] || null,
        query: params.query,
        // Elapsed time in seconds.
        // NOTE(review): seconds chosen to mirror Tavily's response_time unit - confirm.
        response_time: (performance.now() - startedAt) / 1000,
      };
      return { data: transformedResults, service: 'searxng' };
    } catch (error) {
      console.log(`❌ SearXNG instance ${instance} failed: ${error.message}`);
      continue; // Try next instance
    }
  }

  throw new Error('All SearXNG instances failed');
}
/**
 * Attempts search using DuckDuckGo HTML parsing.
 *
 * Fetches the HTML-only DuckDuckGo endpoint and extracts results with a
 * regular expression, so it depends on the exact markup matched below.
 *
 * @param {Object} params - Search parameters
 * @param {string} params.query - Search query
 * @param {number} [params.maxResults=5] - Maximum number of results to return
 * @param {Object} [params.headers] - Extra headers; these override the defaults below
 * @param {number} [timeoutMs=10000] - Request timeout in milliseconds
 * @returns {Promise<{data: Object, service: string}>} Tavily-like results with
 *   `service: 'duckduckgo-html'`
 * @throws {Error} On a non-OK HTTP status or when no result could be parsed
 */
async function tryDuckDuckGoHTML(params, timeoutMs = 10000) {
  const query = encodeURIComponent(params.query);
  const maxResults = params.maxResults || 5;
  const searchUrl = `https://html.duckduckgo.com/html/?q=${query}&kl=us-en`;
  // Monotonic start mark, so the reported time is a real elapsed duration.
  const startedAt = performance.now();

  const response = await fetch(searchUrl, {
    method: 'GET',
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Sec-Fetch-Dest': 'document',
      'Sec-Fetch-Mode': 'navigate',
      'Sec-Fetch-Site': 'none',
      'Cache-Control': 'max-age=0',
      ...params.headers,
    },
    signal: AbortSignal.timeout(timeoutMs),
  });

  if (!response.ok) {
    throw new Error(`DuckDuckGo HTML error: ${response.status}`);
  }

  const html = await response.text();

  // Parse HTML results
  const results = [];
  const resultRegex = /<div class="result">[\s\S]*?<a rel="nofollow" class="result__a" href="([^"]+)">([^<]+)<\/a>[\s\S]*?<a class="result__snippet" href="[^"]*">([^<]*)<\/a>/g;
  let match;
  while ((match = resultRegex.exec(html)) !== null && results.length < maxResults) {
    const [, url, title, snippet] = match;
    if (url && title && !url.includes('//r.jina.ai/http')) { // Filter out redirect links
      results.push({
        title: title.trim(),
        // Protocol-relative hrefs ("//host/path") are upgraded to https.
        url: url.startsWith('http') ? url : `https:${url}`,
        content: snippet ? snippet.replace(/<[^>]*>/g, '').trim() : '',
        // Rank-based score, floored at 0 so it never goes negative.
        score: Math.max(0, 1.0 - results.length * 0.1),
      });
    }
  }

  if (results.length === 0) {
    throw new Error('No results found in DuckDuckGo HTML response');
  }

  const transformedResults = {
    results,
    answer: null, // DuckDuckGo doesn't provide instant answers in HTML mode
    query: params.query,
    // Elapsed time in seconds.
    // NOTE(review): seconds chosen to mirror Tavily's response_time unit - confirm.
    response_time: (performance.now() - startedAt) / 1000,
  };
  return { data: transformedResults, service: 'duckduckgo-html' };
}
/**
 * Attempts search using Startpage HTML parsing.
 *
 * Fetches the Startpage results page and extracts results with a regular
 * expression, so it depends on the exact markup matched below.
 *
 * @param {Object} params - Search parameters
 * @param {string} params.query - Search query
 * @param {number} [params.maxResults=5] - Maximum number of results to return
 * @param {Object} [params.headers] - Extra headers; these override the defaults below
 * @param {number} [timeoutMs=10000] - Request timeout in milliseconds
 * @returns {Promise<{data: Object, service: string}>} Tavily-like results with
 *   `service: 'startpage-html'`
 * @throws {Error} On a non-OK HTTP status or when no result could be parsed
 */
async function tryStartpageHTML(params, timeoutMs = 10000) {
  const query = encodeURIComponent(params.query);
  const maxResults = params.maxResults || 5;
  const searchUrl = `https://www.startpage.com/do/search?query=${query}&cat=web&pl=ext-ff&extVersion=1.3.0`;
  // Monotonic start mark, so the reported time is a real elapsed duration.
  const startedAt = performance.now();

  const response = await fetch(searchUrl, {
    method: 'GET',
    headers: {
      'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Sec-Fetch-Dest': 'document',
      'Sec-Fetch-Mode': 'navigate',
      'Sec-Fetch-Site': 'none',
      'Cache-Control': 'max-age=0',
      ...params.headers,
    },
    signal: AbortSignal.timeout(timeoutMs),
  });

  if (!response.ok) {
    throw new Error(`Startpage HTML error: ${response.status}`);
  }

  const html = await response.text();

  // Parse HTML results (Startpage format)
  const results = [];
  const resultRegex = /<h3><a href="([^"]+)"[^>]*>([^<]+)<\/a><\/h3>[\s\S]*?<p class="snippet">([^<]*)<\/p>/g;
  let match;
  while ((match = resultRegex.exec(html)) !== null && results.length < maxResults) {
    const [, url, title, snippet] = match;
    if (url && title) {
      results.push({
        title: title.trim(),
        // Protocol-relative hrefs ("//host/path") are upgraded to https.
        url: url.startsWith('http') ? url : `https:${url}`,
        content: snippet ? snippet.replace(/<[^>]*>/g, '').trim() : '',
        // Rank-based score, floored at 0 so it never goes negative.
        score: Math.max(0, 1.0 - results.length * 0.1),
      });
    }
  }

  if (results.length === 0) {
    throw new Error('No results found in Startpage HTML response');
  }

  const transformedResults = {
    results,
    answer: null,
    query: params.query,
    // Elapsed time in seconds.
    // NOTE(review): seconds chosen to mirror Tavily's response_time unit - confirm.
    response_time: (performance.now() - startedAt) / 1000,
  };
  return { data: transformedResults, service: 'startpage-html' };
}
/**
 * Builds a set of browser-like request headers whose User-Agent is picked at
 * random from a fixed list; all other headers are constant.
 * @returns {Object} Headers object keyed by header name
 */
function generateRandomHeaders() {
  const candidateAgents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0',
  ];
  const pickedIndex = Math.floor(Math.random() * candidateAgents.length);

  // Everything except the User-Agent is the same on every call.
  const fixedHeaders = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
  };

  return { 'User-Agent': candidateAgents[pickedIndex], ...fixedHeaders };
}
/**
 * Determines if an error is retryable.
 *
 * An error is retryable when any of the following holds:
 *  - `error.code` is 403, 422, 429, 451, 'ECONNREFUSED' or 'ETIMEDOUT';
 *  - `error.message` contains one of those codes, 'SecurityCompromiseError'
 *    or 'blocked until';
 *  - the JSON serialization of the error contains a schema-validation marker
 *    ('missing', 'input_schema', 'field required'; case-insensitive).
 *
 * This runs inside `catch` handlers, so it must never throw itself: a
 * `null`/`undefined` error or one that cannot be serialized (for example a
 * circular structure) is handled without raising.
 *
 * @param {Error} error - The error to check
 * @returns {boolean} True if the error is retryable
 */
function isRetryableError(error) {
  if (error == null) {
    return false;
  }

  const errorMessage = String(error.message || '');

  // JSON.stringify throws on circular references and returns undefined for
  // some inputs; in both cases fall back to an empty string so only the
  // code/message checks apply.
  let errorString = '';
  try {
    errorString = (JSON.stringify(error) ?? '').toLowerCase();
  } catch {
    errorString = '';
  }

  const retryableCodes = [403, 422, 429, 451, 'ECONNREFUSED', 'ETIMEDOUT'];
  const retryableMessageFragments = [
    '403',
    '422',
    '429',
    '451',
    'SecurityCompromiseError',
    'blocked until',
    'ECONNREFUSED',
    'ETIMEDOUT',
  ];
  // Schema validation patterns, matched against the lower-cased serialization
  const schemaFragments = ['missing', 'input_schema', 'field required'];

  return (
    retryableCodes.includes(error.code) ||
    retryableMessageFragments.some((fragment) => errorMessage.includes(fragment)) ||
    schemaFragments.some((fragment) => errorString.includes(fragment))
  );
}
/**
 * Extracts content from a URL, retrying retryable failures with backoff.
 *
 * Up to `maxRetries + 1` attempts are made. Before every attempt after the
 * first, the function waits 2s, 4s, then 8s (the cap). A failure ends the loop
 * immediately when it is the last allowed attempt or when `isRetryableError`
 * rejects it.
 *
 * @param {string} url - The URL to extract content from
 * @param {Object} options - Extraction options; the whole object is also
 *   forwarded to `extractContent`, overriding the default headers/includeImages
 * @param {number} [options.maxRetries=3] - Number of retries after the first attempt
 * @returns {Object} `{ success: true, data, attempt, isURLExtraction: true }`
 *   on success, or `{ error: true, message, attempt, isURLExtraction: true }`
 *   on failure
 */
async function handleURLExtraction(url, options = {}) {
  const { maxRetries = 3 } = options;

  let attempt = 0;
  while (attempt <= maxRetries) {
    const attemptNumber = attempt + 1;
    try {
      // Exponential backoff before each retry, capped at 8 seconds
      if (attempt > 0) {
        const backoffMs = Math.min(2 ** attempt * 1000, 8000);
        await new Promise((resolve) => {
          setTimeout(resolve, backoffMs);
        });
      }

      // Caller-supplied options win over the defaults listed first
      const data = await extractContent(url, {
        headers: generateRandomHeaders(),
        includeImages: false, // Skipped by default for faster processing
        ...options,
      });

      return {
        success: true,
        data,
        attempt: attemptNumber,
        isURLExtraction: true,
      };
    } catch (error) {
      console.error(`URL extraction attempt ${attemptNumber} failed:`, error.message);

      // Give up on the final attempt or on an error that is not worth retrying
      const outOfRetries = attempt === maxRetries;
      if (outOfRetries || !isRetryableError(error)) {
        return {
          error: true,
          message: `Failed to extract content from URL: ${error.message}`,
          attempt: attemptNumber,
          isURLExtraction: true,
        };
      }
    }
    attempt += 1;
  }
}

16
hooks/hooks.json Normal file
View File

@@ -0,0 +1,16 @@
{
"hooks": {
"PostToolUse": [
{
"matcher": "WebSearch|WebFetch",
"hooks": [
{
"type": "command",
"command": "node ${CLAUDE_PLUGIN_ROOT}/hooks/handle-web-search.mjs",
"timeout": 30
}
]
}
]
}
}