Files
gh-jezweb-claude-skills-ski…/templates/metadata-filtering.ts
2025-11-30 08:24:34 +08:00

426 lines
12 KiB
TypeScript

/**
* Advanced Metadata Filtering Examples for Cloudflare Vectorize
*
* Use case: Multi-tenant apps, complex filtering, range queries, nested metadata
*
* Features:
* - All filter operators ($eq, $ne, $in, $nin, $lt, $lte, $gt, $gte)
* - Nested metadata with dot notation
* - Namespace-based isolation
* - Combined filters (implicit AND)
* - Range queries on numbers and strings
* - Performance optimization tips
*/
export interface Env {
VECTORIZE_INDEX: VectorizeIndex;
AI: Ai;
}
interface FilterExample {
name: string;
description: string;
filter: Record<string, any>;
namespace?: string;
}
export default {
async fetch(request: Request, env: Env, ctx: ExecutionContext): Promise<Response> {
// Handle CORS
if (request.method === 'OPTIONS') {
return new Response(null, {
headers: {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET, POST, OPTIONS',
'Access-Control-Allow-Headers': 'Content-Type',
},
});
}
const url = new URL(request.url);
// Route: GET /examples - Show all filter examples
if (url.pathname === '/examples' && request.method === 'GET') {
const examples: FilterExample[] = [
{
name: 'Equality (implicit)',
description: 'Find vectors with exact category match',
filter: { category: 'documentation' },
},
{
name: 'Equality (explicit)',
description: 'Explicit $eq operator',
filter: { category: { $eq: 'documentation' } },
},
{
name: 'Not Equals',
description: 'Exclude archived documents',
filter: { status: { $ne: 'archived' } },
},
{
name: 'In Array',
description: 'Match any of multiple categories',
filter: { category: { $in: ['docs', 'tutorials', 'guides'] } },
},
{
name: 'Not In Array',
description: 'Exclude multiple statuses',
filter: { status: { $nin: ['archived', 'draft', 'deleted'] } },
},
{
name: 'Greater Than',
description: 'Documents published after date',
filter: { published_at: { $gt: 1704067200 } }, // Jan 1, 2024
},
{
name: 'Less Than or Equal',
description: 'Documents published before or on date',
filter: { published_at: { $lte: 1735689600 } }, // Jan 1, 2025
},
{
name: 'Range Query (numbers)',
description: 'Documents published in 2024',
filter: {
published_at: {
$gte: 1704067200, // >= Jan 1, 2024
$lt: 1735689600, // < Jan 1, 2025
},
},
},
{
name: 'Range Query (strings - prefix search)',
description: 'URLs starting with /docs/workers/',
filter: {
url: {
$gte: '/docs/workers/',
$lt: '/docs/workersz', // 'z' is after all possible chars
},
},
},
{
name: 'Nested Metadata',
description: 'Filter by nested author ID',
filter: { 'author.id': 'user123' },
},
{
name: 'Combined Filters (AND)',
description: 'Multiple conditions (implicit AND)',
filter: {
category: 'docs',
language: 'en',
published: true,
published_at: { $gte: 1704067200 },
},
},
{
name: 'Multi-tenant (namespace)',
description: 'Isolate by customer ID using namespace',
namespace: 'customer-abc123',
filter: { type: 'support_ticket' },
},
{
name: 'Boolean Filter',
description: 'Published documents only',
filter: { published: true },
},
{
name: 'Complex Multi-field',
description: 'Docs in English, published in 2024, not archived',
filter: {
category: { $in: ['docs', 'tutorials'] },
language: 'en',
status: { $ne: 'archived' },
published_at: { $gte: 1704067200, $lt: 1735689600 },
'author.verified': true,
},
},
];
return Response.json({ examples });
}
// Route: POST /search/filtered - Execute filtered search
if (url.pathname === '/search/filtered' && request.method === 'POST') {
try {
const body = await request.json() as {
query: string;
exampleName?: string;
filter?: Record<string, any>;
namespace?: string;
topK?: number;
};
const { query, exampleName, filter, namespace, topK = 5 } = body;
if (!query) {
return Response.json({ error: 'Missing required field: query' }, { status: 400 });
}
// If exampleName provided, use pre-defined filter
let finalFilter = filter;
let finalNamespace = namespace;
if (exampleName) {
const examplesResponse = await this.fetch(
new Request(new URL('/examples', request.url)),
env,
ctx
);
const { examples } = (await examplesResponse.json()) as { examples: FilterExample[] };
const example = examples.find((ex) => ex.name === exampleName);
if (example) {
finalFilter = example.filter;
finalNamespace = example.namespace || namespace;
}
}
// Generate embedding
const embedding = await env.AI.run('@cf/baai/bge-base-en-v1.5', {
text: query,
});
// Query with filter
const results = await env.VECTORIZE_INDEX.query(embedding.data[0], {
topK,
filter: finalFilter,
namespace: finalNamespace,
returnMetadata: 'all',
returnValues: false,
});
return Response.json({
query,
filter: finalFilter,
namespace: finalNamespace,
results: results.matches.map((m) => ({
id: m.id,
score: m.score,
metadata: m.metadata,
})),
count: results.count,
}, {
headers: { 'Access-Control-Allow-Origin': '*' },
});
} catch (error) {
console.error('Filtered search error:', error);
return Response.json(
{
error: 'Filtered search failed',
message: error instanceof Error ? error.message : 'Unknown error',
},
{ status: 500 }
);
}
}
// Route: POST /seed - Seed example data with rich metadata
if (url.pathname === '/seed' && request.method === 'POST') {
try {
// Sample documents with diverse metadata
const sampleDocs = [
{
content: 'Cloudflare Workers are serverless functions that run on the edge.',
metadata: {
category: 'documentation',
language: 'en',
status: 'published',
published_at: 1704153600, // Jan 2, 2024
published: true,
url: '/docs/workers/intro',
author: { id: 'user123', name: 'John Doe', verified: true },
tags: ['workers', 'serverless', 'edge'],
},
},
{
content: 'Vectorize is a globally distributed vector database.',
metadata: {
category: 'documentation',
language: 'en',
status: 'published',
published_at: 1720310400, // Jul 7, 2024
published: true,
url: '/docs/vectorize/intro',
author: { id: 'user456', name: 'Jane Smith', verified: true },
tags: ['vectorize', 'database', 'ai'],
},
},
{
content: 'D1 is Cloudflare\'s serverless SQL database.',
metadata: {
category: 'tutorials',
language: 'en',
status: 'draft',
published_at: 1735603200, // Dec 31, 2024
published: false,
url: '/tutorials/d1/getting-started',
author: { id: 'user123', name: 'John Doe', verified: true },
tags: ['d1', 'database', 'sql'],
},
},
{
content: 'R2 provides S3-compatible object storage without egress fees.',
metadata: {
category: 'guides',
language: 'en',
status: 'published',
published_at: 1712880000, // Apr 12, 2024
published: true,
url: '/docs/r2/overview',
author: { id: 'user789', name: 'Bob Wilson', verified: false },
tags: ['r2', 'storage', 'object-storage'],
},
},
{
content: 'Workers KV is a key-value store for edge applications.',
metadata: {
category: 'documentation',
language: 'en',
status: 'archived',
published_at: 1640995200, // Jan 1, 2022
published: true,
url: '/docs/kv/intro',
author: { id: 'user456', name: 'Jane Smith', verified: true },
tags: ['kv', 'storage', 'edge'],
},
},
];
// Generate embeddings
const texts = sampleDocs.map((doc) => doc.content);
const embeddings = await env.AI.run('@cf/baai/bge-base-en-v1.5', { text: texts });
// Prepare vectors
const vectors = sampleDocs.map((doc, i) => ({
id: `sample-${i + 1}`,
values: embeddings.data[i],
metadata: {
content: doc.content,
...doc.metadata,
indexed_at: Date.now(),
},
}));
// Upsert all
await env.VECTORIZE_INDEX.upsert(vectors);
return Response.json({
success: true,
message: 'Seeded 5 sample documents with rich metadata',
count: vectors.length,
}, {
headers: { 'Access-Control-Allow-Origin': '*' },
});
} catch (error) {
console.error('Seed error:', error);
return Response.json(
{
error: 'Seeding failed',
message: error instanceof Error ? error.message : 'Unknown error',
},
{ status: 500 }
);
}
}
// Default: API documentation
return Response.json({
name: 'Metadata Filtering Examples API',
endpoints: {
'GET /examples': {
description: 'List all filter examples with syntax',
},
'POST /search/filtered': {
description: 'Execute filtered vector search',
body: {
query: 'string (required)',
exampleName: 'string (optional) - use pre-defined filter',
filter: 'object (optional) - custom filter',
namespace: 'string (optional)',
topK: 'number (optional, default: 5)',
},
example: {
query: 'serverless database',
exampleName: 'Range Query (numbers)',
},
},
'POST /seed': {
description: 'Seed database with example documents',
note: 'Creates 5 sample documents with rich metadata for testing',
},
},
filterOperators: {
$eq: 'Equals',
$ne: 'Not equals',
$in: 'In array',
$nin: 'Not in array',
$lt: 'Less than',
$lte: 'Less than or equal',
$gt: 'Greater than',
$gte: 'Greater than or equal',
},
notes: {
'Metadata Keys': 'Cannot be empty, contain dots (.), quotes ("), or start with $',
'Filter Size': 'Max 2048 bytes (compact JSON)',
'Cardinality': 'High cardinality in range queries can impact performance',
'Namespace': 'Applied BEFORE metadata filters',
},
});
},
};
/**
* Example Usage:
*
* 1. Seed example data:
*
* curl -X POST https://your-worker.workers.dev/seed
*
* 2. List filter examples:
*
* curl https://your-worker.workers.dev/examples
*
* 3. Search with pre-defined filter:
*
* curl -X POST https://your-worker.workers.dev/search/filtered \
* -H "Content-Type: application/json" \
* -d '{
* "query": "database storage",
* "exampleName": "Range Query (numbers)"
* }'
*
* 4. Search with custom filter:
*
* curl -X POST https://your-worker.workers.dev/search/filtered \
* -H "Content-Type: application/json" \
* -d '{
* "query": "edge computing",
* "filter": {
* "category": { "$in": ["docs", "tutorials"] },
* "language": "en",
* "status": { "$ne": "archived" },
* "author.verified": true
* },
* "topK": 3
* }'
*
* Performance Tips:
*
* 1. Low Cardinality for Range Queries:
* ✅ Good: published_at (timestamps in seconds, not milliseconds)
* ❌ Bad: user_id (millions of unique values in range)
*
* 2. Namespace First:
* Use namespace for partition key (customer_id, tenant_id)
* Then use metadata filters for finer-grained filtering
*
* 3. Filter Size:
* Keep filters under 2048 bytes
* If hitting limit, split into multiple queries
*
* 4. Indexed Metadata:
* Create metadata indexes BEFORE inserting vectors:
* npx wrangler vectorize create-metadata-index my-index \
* --property-name=category --type=string
*/