Initial commit
This commit is contained in:
425
templates/metadata-filtering.ts
Normal file
425
templates/metadata-filtering.ts
Normal file
@@ -0,0 +1,425 @@
|
||||
/**
|
||||
* Advanced Metadata Filtering Examples for Cloudflare Vectorize
|
||||
*
|
||||
* Use case: Multi-tenant apps, complex filtering, range queries, nested metadata
|
||||
*
|
||||
* Features:
|
||||
* - All filter operators ($eq, $ne, $in, $nin, $lt, $lte, $gt, $gte)
|
||||
* - Nested metadata with dot notation
|
||||
* - Namespace-based isolation
|
||||
* - Combined filters (implicit AND)
|
||||
* - Range queries on numbers and strings
|
||||
* - Performance optimization tips
|
||||
*/
|
||||
|
||||
export interface Env {
|
||||
VECTORIZE_INDEX: VectorizeIndex;
|
||||
AI: Ai;
|
||||
}
|
||||
|
||||
interface FilterExample {
|
||||
name: string;
|
||||
description: string;
|
||||
filter: Record<string, any>;
|
||||
namespace?: string;
|
||||
}
|
||||
|
||||
export default {
|
||||
async fetch(request: Request, env: Env, ctx: ExecutionContext): Promise<Response> {
|
||||
// Handle CORS
|
||||
if (request.method === 'OPTIONS') {
|
||||
return new Response(null, {
|
||||
headers: {
|
||||
'Access-Control-Allow-Origin': '*',
|
||||
'Access-Control-Allow-Methods': 'GET, POST, OPTIONS',
|
||||
'Access-Control-Allow-Headers': 'Content-Type',
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
const url = new URL(request.url);
|
||||
|
||||
// Route: GET /examples - Show all filter examples
|
||||
if (url.pathname === '/examples' && request.method === 'GET') {
|
||||
const examples: FilterExample[] = [
|
||||
{
|
||||
name: 'Equality (implicit)',
|
||||
description: 'Find vectors with exact category match',
|
||||
filter: { category: 'documentation' },
|
||||
},
|
||||
{
|
||||
name: 'Equality (explicit)',
|
||||
description: 'Explicit $eq operator',
|
||||
filter: { category: { $eq: 'documentation' } },
|
||||
},
|
||||
{
|
||||
name: 'Not Equals',
|
||||
description: 'Exclude archived documents',
|
||||
filter: { status: { $ne: 'archived' } },
|
||||
},
|
||||
{
|
||||
name: 'In Array',
|
||||
description: 'Match any of multiple categories',
|
||||
filter: { category: { $in: ['docs', 'tutorials', 'guides'] } },
|
||||
},
|
||||
{
|
||||
name: 'Not In Array',
|
||||
description: 'Exclude multiple statuses',
|
||||
filter: { status: { $nin: ['archived', 'draft', 'deleted'] } },
|
||||
},
|
||||
{
|
||||
name: 'Greater Than',
|
||||
description: 'Documents published after date',
|
||||
filter: { published_at: { $gt: 1704067200 } }, // Jan 1, 2024
|
||||
},
|
||||
{
|
||||
name: 'Less Than or Equal',
|
||||
description: 'Documents published before or on date',
|
||||
filter: { published_at: { $lte: 1735689600 } }, // Jan 1, 2025
|
||||
},
|
||||
{
|
||||
name: 'Range Query (numbers)',
|
||||
description: 'Documents published in 2024',
|
||||
filter: {
|
||||
published_at: {
|
||||
$gte: 1704067200, // >= Jan 1, 2024
|
||||
$lt: 1735689600, // < Jan 1, 2025
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: 'Range Query (strings - prefix search)',
|
||||
description: 'URLs starting with /docs/workers/',
|
||||
filter: {
|
||||
url: {
|
||||
$gte: '/docs/workers/',
|
||||
$lt: '/docs/workersz', // 'z' is after all possible chars
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: 'Nested Metadata',
|
||||
description: 'Filter by nested author ID',
|
||||
filter: { 'author.id': 'user123' },
|
||||
},
|
||||
{
|
||||
name: 'Combined Filters (AND)',
|
||||
description: 'Multiple conditions (implicit AND)',
|
||||
filter: {
|
||||
category: 'docs',
|
||||
language: 'en',
|
||||
published: true,
|
||||
published_at: { $gte: 1704067200 },
|
||||
},
|
||||
},
|
||||
{
|
||||
name: 'Multi-tenant (namespace)',
|
||||
description: 'Isolate by customer ID using namespace',
|
||||
namespace: 'customer-abc123',
|
||||
filter: { type: 'support_ticket' },
|
||||
},
|
||||
{
|
||||
name: 'Boolean Filter',
|
||||
description: 'Published documents only',
|
||||
filter: { published: true },
|
||||
},
|
||||
{
|
||||
name: 'Complex Multi-field',
|
||||
description: 'Docs in English, published in 2024, not archived',
|
||||
filter: {
|
||||
category: { $in: ['docs', 'tutorials'] },
|
||||
language: 'en',
|
||||
status: { $ne: 'archived' },
|
||||
published_at: { $gte: 1704067200, $lt: 1735689600 },
|
||||
'author.verified': true,
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
return Response.json({ examples });
|
||||
}
|
||||
|
||||
// Route: POST /search/filtered - Execute filtered search
|
||||
if (url.pathname === '/search/filtered' && request.method === 'POST') {
|
||||
try {
|
||||
const body = await request.json() as {
|
||||
query: string;
|
||||
exampleName?: string;
|
||||
filter?: Record<string, any>;
|
||||
namespace?: string;
|
||||
topK?: number;
|
||||
};
|
||||
|
||||
const { query, exampleName, filter, namespace, topK = 5 } = body;
|
||||
|
||||
if (!query) {
|
||||
return Response.json({ error: 'Missing required field: query' }, { status: 400 });
|
||||
}
|
||||
|
||||
// If exampleName provided, use pre-defined filter
|
||||
let finalFilter = filter;
|
||||
let finalNamespace = namespace;
|
||||
|
||||
if (exampleName) {
|
||||
const examplesResponse = await this.fetch(
|
||||
new Request(new URL('/examples', request.url)),
|
||||
env,
|
||||
ctx
|
||||
);
|
||||
const { examples } = (await examplesResponse.json()) as { examples: FilterExample[] };
|
||||
|
||||
const example = examples.find((ex) => ex.name === exampleName);
|
||||
if (example) {
|
||||
finalFilter = example.filter;
|
||||
finalNamespace = example.namespace || namespace;
|
||||
}
|
||||
}
|
||||
|
||||
// Generate embedding
|
||||
const embedding = await env.AI.run('@cf/baai/bge-base-en-v1.5', {
|
||||
text: query,
|
||||
});
|
||||
|
||||
// Query with filter
|
||||
const results = await env.VECTORIZE_INDEX.query(embedding.data[0], {
|
||||
topK,
|
||||
filter: finalFilter,
|
||||
namespace: finalNamespace,
|
||||
returnMetadata: 'all',
|
||||
returnValues: false,
|
||||
});
|
||||
|
||||
return Response.json({
|
||||
query,
|
||||
filter: finalFilter,
|
||||
namespace: finalNamespace,
|
||||
results: results.matches.map((m) => ({
|
||||
id: m.id,
|
||||
score: m.score,
|
||||
metadata: m.metadata,
|
||||
})),
|
||||
count: results.count,
|
||||
}, {
|
||||
headers: { 'Access-Control-Allow-Origin': '*' },
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Filtered search error:', error);
|
||||
return Response.json(
|
||||
{
|
||||
error: 'Filtered search failed',
|
||||
message: error instanceof Error ? error.message : 'Unknown error',
|
||||
},
|
||||
{ status: 500 }
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Route: POST /seed - Seed example data with rich metadata
|
||||
if (url.pathname === '/seed' && request.method === 'POST') {
|
||||
try {
|
||||
// Sample documents with diverse metadata
|
||||
const sampleDocs = [
|
||||
{
|
||||
content: 'Cloudflare Workers are serverless functions that run on the edge.',
|
||||
metadata: {
|
||||
category: 'documentation',
|
||||
language: 'en',
|
||||
status: 'published',
|
||||
published_at: 1704153600, // Jan 2, 2024
|
||||
published: true,
|
||||
url: '/docs/workers/intro',
|
||||
author: { id: 'user123', name: 'John Doe', verified: true },
|
||||
tags: ['workers', 'serverless', 'edge'],
|
||||
},
|
||||
},
|
||||
{
|
||||
content: 'Vectorize is a globally distributed vector database.',
|
||||
metadata: {
|
||||
category: 'documentation',
|
||||
language: 'en',
|
||||
status: 'published',
|
||||
published_at: 1720310400, // Jul 7, 2024
|
||||
published: true,
|
||||
url: '/docs/vectorize/intro',
|
||||
author: { id: 'user456', name: 'Jane Smith', verified: true },
|
||||
tags: ['vectorize', 'database', 'ai'],
|
||||
},
|
||||
},
|
||||
{
|
||||
content: 'D1 is Cloudflare\'s serverless SQL database.',
|
||||
metadata: {
|
||||
category: 'tutorials',
|
||||
language: 'en',
|
||||
status: 'draft',
|
||||
published_at: 1735603200, // Dec 31, 2024
|
||||
published: false,
|
||||
url: '/tutorials/d1/getting-started',
|
||||
author: { id: 'user123', name: 'John Doe', verified: true },
|
||||
tags: ['d1', 'database', 'sql'],
|
||||
},
|
||||
},
|
||||
{
|
||||
content: 'R2 provides S3-compatible object storage without egress fees.',
|
||||
metadata: {
|
||||
category: 'guides',
|
||||
language: 'en',
|
||||
status: 'published',
|
||||
published_at: 1712880000, // Apr 12, 2024
|
||||
published: true,
|
||||
url: '/docs/r2/overview',
|
||||
author: { id: 'user789', name: 'Bob Wilson', verified: false },
|
||||
tags: ['r2', 'storage', 'object-storage'],
|
||||
},
|
||||
},
|
||||
{
|
||||
content: 'Workers KV is a key-value store for edge applications.',
|
||||
metadata: {
|
||||
category: 'documentation',
|
||||
language: 'en',
|
||||
status: 'archived',
|
||||
published_at: 1640995200, // Jan 1, 2022
|
||||
published: true,
|
||||
url: '/docs/kv/intro',
|
||||
author: { id: 'user456', name: 'Jane Smith', verified: true },
|
||||
tags: ['kv', 'storage', 'edge'],
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
// Generate embeddings
|
||||
const texts = sampleDocs.map((doc) => doc.content);
|
||||
const embeddings = await env.AI.run('@cf/baai/bge-base-en-v1.5', { text: texts });
|
||||
|
||||
// Prepare vectors
|
||||
const vectors = sampleDocs.map((doc, i) => ({
|
||||
id: `sample-${i + 1}`,
|
||||
values: embeddings.data[i],
|
||||
metadata: {
|
||||
content: doc.content,
|
||||
...doc.metadata,
|
||||
indexed_at: Date.now(),
|
||||
},
|
||||
}));
|
||||
|
||||
// Upsert all
|
||||
await env.VECTORIZE_INDEX.upsert(vectors);
|
||||
|
||||
return Response.json({
|
||||
success: true,
|
||||
message: 'Seeded 5 sample documents with rich metadata',
|
||||
count: vectors.length,
|
||||
}, {
|
||||
headers: { 'Access-Control-Allow-Origin': '*' },
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Seed error:', error);
|
||||
return Response.json(
|
||||
{
|
||||
error: 'Seeding failed',
|
||||
message: error instanceof Error ? error.message : 'Unknown error',
|
||||
},
|
||||
{ status: 500 }
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Default: API documentation
|
||||
return Response.json({
|
||||
name: 'Metadata Filtering Examples API',
|
||||
endpoints: {
|
||||
'GET /examples': {
|
||||
description: 'List all filter examples with syntax',
|
||||
},
|
||||
'POST /search/filtered': {
|
||||
description: 'Execute filtered vector search',
|
||||
body: {
|
||||
query: 'string (required)',
|
||||
exampleName: 'string (optional) - use pre-defined filter',
|
||||
filter: 'object (optional) - custom filter',
|
||||
namespace: 'string (optional)',
|
||||
topK: 'number (optional, default: 5)',
|
||||
},
|
||||
example: {
|
||||
query: 'serverless database',
|
||||
exampleName: 'Range Query (numbers)',
|
||||
},
|
||||
},
|
||||
'POST /seed': {
|
||||
description: 'Seed database with example documents',
|
||||
note: 'Creates 5 sample documents with rich metadata for testing',
|
||||
},
|
||||
},
|
||||
filterOperators: {
|
||||
$eq: 'Equals',
|
||||
$ne: 'Not equals',
|
||||
$in: 'In array',
|
||||
$nin: 'Not in array',
|
||||
$lt: 'Less than',
|
||||
$lte: 'Less than or equal',
|
||||
$gt: 'Greater than',
|
||||
$gte: 'Greater than or equal',
|
||||
},
|
||||
notes: {
|
||||
'Metadata Keys': 'Cannot be empty, contain dots (.), quotes ("), or start with $',
|
||||
'Filter Size': 'Max 2048 bytes (compact JSON)',
|
||||
'Cardinality': 'High cardinality in range queries can impact performance',
|
||||
'Namespace': 'Applied BEFORE metadata filters',
|
||||
},
|
||||
});
|
||||
},
|
||||
};
|
||||
|
||||
/**
|
||||
* Example Usage:
|
||||
*
|
||||
* 1. Seed example data:
|
||||
*
|
||||
* curl -X POST https://your-worker.workers.dev/seed
|
||||
*
|
||||
* 2. List filter examples:
|
||||
*
|
||||
* curl https://your-worker.workers.dev/examples
|
||||
*
|
||||
* 3. Search with pre-defined filter:
|
||||
*
|
||||
* curl -X POST https://your-worker.workers.dev/search/filtered \
|
||||
* -H "Content-Type: application/json" \
|
||||
* -d '{
|
||||
* "query": "database storage",
|
||||
* "exampleName": "Range Query (numbers)"
|
||||
* }'
|
||||
*
|
||||
* 4. Search with custom filter:
|
||||
*
|
||||
* curl -X POST https://your-worker.workers.dev/search/filtered \
|
||||
* -H "Content-Type: application/json" \
|
||||
* -d '{
|
||||
* "query": "edge computing",
|
||||
* "filter": {
|
||||
* "category": { "$in": ["docs", "tutorials"] },
|
||||
* "language": "en",
|
||||
* "status": { "$ne": "archived" },
|
||||
* "author.verified": true
|
||||
* },
|
||||
* "topK": 3
|
||||
* }'
|
||||
*
|
||||
* Performance Tips:
|
||||
*
|
||||
* 1. Low Cardinality for Range Queries:
|
||||
* ✅ Good: published_at (timestamps in seconds, not milliseconds)
|
||||
* ❌ Bad: user_id (millions of unique values in range)
|
||||
*
|
||||
* 2. Namespace First:
|
||||
* Use namespace for partition key (customer_id, tenant_id)
|
||||
* Then use metadata filters for finer-grained filtering
|
||||
*
|
||||
* 3. Filter Size:
|
||||
* Keep filters under 2048 bytes
|
||||
* If hitting limit, split into multiple queries
|
||||
*
|
||||
* 4. Indexed Metadata:
|
||||
* Create metadata indexes BEFORE inserting vectors:
|
||||
* npx wrangler vectorize create-metadata-index my-index \
|
||||
* --property-name=category --type=string
|
||||
*/
|
||||
Reference in New Issue
Block a user