/**
 * OpenAI Moderation API - Content Safety Examples
 *
 * This template demonstrates:
 * - Basic content moderation
 * - All 11 safety categories
 * - Category scoring and thresholds
 * - Batch moderation
 * - Production patterns for UGC
 */

import OpenAI from 'openai';

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

// =============================================================================
// BASIC MODERATION
// =============================================================================

async function basicModeration() {
  const moderation = await openai.moderations.create({
    model: 'omni-moderation-latest',
    input: 'I want to hurt someone.',
  });

  const result = moderation.results[0];

  console.log('Flagged:', result.flagged);
  console.log('Categories flagged:');

  // Cast to an indexable record so string keys from Object.entries type-check
  const scores = result.category_scores as unknown as Record<string, number>;

  Object.entries(result.categories).forEach(([category, flagged]) => {
    if (flagged) {
      console.log(`  - ${category}: ${scores[category].toFixed(4)}`);
    }
  });

  return result;
}

// =============================================================================
// ALL SAFETY CATEGORIES
// =============================================================================

async function allCategories() {
  // The 11 categories below are those shared with earlier text-moderation
  // models; omni-moderation models additionally report `illicit` and
  // `illicit/violent`.
  const examples = {
    sexual: 'Explicit sexual content example',
    hate: 'Hateful speech based on identity',
    harassment: 'Bullying and intimidation example',
    'self-harm': 'Content promoting self-harm',
    'sexual/minors': 'Any sexualization of minors',
    'hate/threatening': 'Violent threats based on identity',
    'violence/graphic': 'Graphic violence description',
    'self-harm/intent': 'Active suicidal ideation',
    'self-harm/instructions': 'How to self-harm instructions',
    'harassment/threatening': 'Violent threat toward individual',
    violence: 'General violent content',
  };

  console.log('Safety Categories:\n');

  for (const [category, description] of Object.entries(examples)) {
    console.log(`${category}:`);
    console.log(`  Description: ${description}`);
  }

  console.log('\nNote: pass actual content to the API for real moderation; these strings are only category descriptions');
}

// =============================================================================
// CATEGORY SCORES AND THRESHOLDS
// =============================================================================

interface Thresholds {
  [key: string]: number;
}

const recommendedThresholds: Thresholds = {
  sexual: 0.5,
  hate: 0.4,
  harassment: 0.5,
  'self-harm': 0.3,
  'sexual/minors': 0.1, // Lower threshold for child safety
  'hate/threatening': 0.3,
  'violence/graphic': 0.5,
  'self-harm/intent': 0.2,
  'self-harm/instructions': 0.2,
  'harassment/threatening': 0.3,
  violence: 0.5,
};

function checkThresholds(result: any, thresholds: Thresholds): boolean {
  const scores = result.category_scores as Record<string, number>;
  return Object.entries(scores).some(
    // Use ?? instead of || so an explicit threshold of 0 is respected
    ([category, score]) => score > (thresholds[category] ?? 0.5)
  );
}

async function withCustomThresholds(text: string) {
  const moderation = await openai.moderations.create({
    model: 'omni-moderation-latest',
    input: text,
  });

  const result = moderation.results[0];

  const isFlagged = checkThresholds(result, recommendedThresholds);

  console.log('Content:', text);
  console.log('API flagged:', result.flagged);
  console.log('Custom thresholds flagged:', isFlagged);

  if (isFlagged) {
    console.log('Flagged categories:');
    const scores = result.category_scores as unknown as Record<string, number>;
    Object.entries(scores).forEach(([category, score]) => {
      const threshold = recommendedThresholds[category] ?? 0.5;
      if (score > threshold) {
        console.log(`  - ${category}: ${score.toFixed(4)} (threshold: ${threshold})`);
      }
    });
  }

  return { result, isFlagged };
}

// =============================================================================
// BATCH MODERATION
// =============================================================================

async function batchModeration() {
  const texts = [
    'This is a normal, safe comment',
    'Potentially harmful content example',
    'Another safe piece of text',
  ];

  // The endpoint accepts an array of inputs in a single request; for very
  // large volumes, see the chunking sketch below
  const moderation = await openai.moderations.create({
    model: 'omni-moderation-latest',
    input: texts,
  });

  moderation.results.forEach((result, index) => {
    console.log(`\nInput ${index + 1}: "${texts[index]}"`);
    console.log('Flagged:', result.flagged);

    if (result.flagged) {
      const categories = result.categories as unknown as Record<string, boolean>;
      const flaggedCategories = Object.keys(categories).filter(cat => categories[cat]);
      console.log('Categories:', flaggedCategories.join(', '));
    }
  });

  return moderation.results;
}

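// -----------------------------------------------------------------------------
// Chunking sketch (illustrative): splits a large input set into fixed-size
// chunks and moderates each chunk sequentially. The chunk size of 32 is an
// assumed default, not a documented API limit.
// -----------------------------------------------------------------------------

async function moderateInChunks(texts: string[], chunkSize = 32) {
  const results: any[] = [];
  for (let i = 0; i < texts.length; i += chunkSize) {
    const chunk = texts.slice(i, i + chunkSize);
    const moderation = await openai.moderations.create({
      model: 'omni-moderation-latest',
      input: chunk,
    });
    results.push(...moderation.results);
  }
  return results;
}
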
// =============================================================================
// PRODUCTION PATTERN - UGC MODERATION
// =============================================================================

interface ModerationDecision {
  allowed: boolean;
  reason?: string;
  severity?: 'low' | 'medium' | 'high' | 'error';
  scores?: any;
}

async function moderateUserContent(userInput: string): Promise<ModerationDecision> {
  try {
    const moderation = await openai.moderations.create({
      model: 'omni-moderation-latest',
      input: userInput,
    });

    const result = moderation.results[0];
    const categories = result.categories as unknown as Record<string, boolean>;

    // Immediate block for severe categories
    const severeCategories = [
      'sexual/minors',
      'self-harm/intent',
      'hate/threatening',
      'harassment/threatening',
    ];

    for (const category of severeCategories) {
      if (categories[category]) {
        return {
          allowed: false,
          reason: `Content violates policy: ${category}`,
          severity: 'high',
        };
      }
    }

    // High-confidence violence check
    if (result.category_scores.violence > 0.8) {
      return {
        allowed: false,
        reason: 'High-confidence violence detected',
        severity: 'medium',
      };
    }

    // Self-harm content requires human review
    if (result.categories['self-harm']) {
      return {
        allowed: false,
        reason: 'Content flagged for human review',
        severity: 'medium',
      };
    }

    // Allow content
    return {
      allowed: true,
      scores: result.category_scores,
    };
  } catch (error: any) {
    console.error('Moderation error:', error);

    // Fail closed: block on error
    return {
      allowed: false,
      reason: 'Moderation service unavailable',
      severity: 'error',
    };
  }
}

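// -----------------------------------------------------------------------------
// Usage sketch (illustrative): one way to wire moderateUserContent into a
// comment-submission flow. `saveComment` is a hypothetical placeholder, not
// part of this template.
// -----------------------------------------------------------------------------

async function handleCommentSubmission(comment: string): Promise<string> {
  const decision = await moderateUserContent(comment);

  if (!decision.allowed) {
    // Surface only a generic reason; details stay in server-side logs
    return decision.reason ?? 'Content rejected';
  }

  // await saveComment(comment); // hypothetical persistence call
  return 'Comment published';
}
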
// =============================================================================
// CATEGORY-SPECIFIC FILTERING
// =============================================================================

async function filterByCategory(text: string, categoriesToCheck: string[]) {
  const moderation = await openai.moderations.create({
    model: 'omni-moderation-latest',
    input: text,
  });

  const result = moderation.results[0];
  const categories = result.categories as unknown as Record<string, boolean>;

  const violations = categoriesToCheck.filter(category => categories[category]);

  if (violations.length > 0) {
    console.log('Content violates:', violations.join(', '));
    return false;
  }

  console.log('Content passed specified category checks');
  return true;
}

// =============================================================================
// LOGGING AND AUDIT TRAIL
// =============================================================================

interface ModerationLog {
  timestamp: string;
  content: string;
  flagged: boolean;
  categories: string[];
  scores: any;
  action: 'allowed' | 'blocked' | 'review';
}

async function moderateWithLogging(content: string): Promise<ModerationLog> {
  const moderation = await openai.moderations.create({
    model: 'omni-moderation-latest',
    input: content,
  });

  const result = moderation.results[0];
  const categories = result.categories as unknown as Record<string, boolean>;

  const flaggedCategories = Object.keys(categories).filter(cat => categories[cat]);

  const log: ModerationLog = {
    timestamp: new Date().toISOString(),
    content: content.substring(0, 100), // Truncate for logging
    flagged: result.flagged,
    categories: flaggedCategories,
    scores: result.category_scores,
    action: result.flagged ? 'blocked' : 'allowed',
  };

  // In production: save to a database or logging service (see sketch below)
  console.log('Moderation log:', JSON.stringify(log, null, 2));

  return log;
}

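// -----------------------------------------------------------------------------
// Persistence sketch (illustrative): moderateWithLogging above only writes to
// the console. A minimal audit trail, assuming a Node.js runtime, appends each
// entry as one JSON line (JSONL) to a local file; in production you would swap
// this for a database or logging service. The file path is an assumption.
// -----------------------------------------------------------------------------

import { appendFile } from 'node:fs/promises';

async function persistModerationLog(
  log: ModerationLog,
  filePath = 'moderation-audit.jsonl' // hypothetical path
): Promise<void> {
  // One JSON object per line keeps the file easy to grep and stream-parse
  await appendFile(filePath, JSON.stringify(log) + '\n', 'utf8');
}
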
// =============================================================================
// USER FEEDBACK PATTERN
// =============================================================================

function getUserFriendlyMessage(result: any): string {
  if (!result.flagged) {
    return 'Content approved';
  }

  const flaggedCategories = Object.keys(result.categories).filter(
    cat => result.categories[cat]
  );

  // Don't reveal exact detection details
  if (flaggedCategories.some(cat => cat.includes('harm'))) {
    return 'Your content appears to contain concerning material. Please review our community guidelines.';
  }

  if (flaggedCategories.includes('harassment') || flaggedCategories.includes('hate')) {
    return 'Your content may be disrespectful or harmful to others. Please rephrase.';
  }

  if (flaggedCategories.includes('violence')) {
    return 'Your content contains violent themes that violate our policies.';
  }

  return 'Your content doesn\'t meet our community guidelines. Please revise and try again.';
}

// =============================================================================
// ERROR HANDLING
// =============================================================================

async function withErrorHandling(text: string) {
  try {
    const moderation = await openai.moderations.create({
      model: 'omni-moderation-latest',
      input: text,
    });

    return moderation.results[0];
  } catch (error: any) {
    if (error.status === 401) {
      console.error('Invalid API key');
    } else if (error.status === 429) {
      console.error('Rate limit exceeded - implement retry logic (see sketch below)');
    } else if (error.status === 500) {
      console.error('OpenAI service error - fail closed and block content');
    } else {
      console.error('Unexpected error:', error.message);
    }

    throw error;
  }
}

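// -----------------------------------------------------------------------------
// Retry sketch (illustrative): one way to handle the 429 case above is
// exponential backoff with a bounded number of attempts. maxAttempts and
// baseDelayMs are assumed defaults, not values from the original template.
// -----------------------------------------------------------------------------

async function moderateWithRetry(text: string, maxAttempts = 3, baseDelayMs = 500) {
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const moderation = await openai.moderations.create({
        model: 'omni-moderation-latest',
        input: text,
      });
      return moderation.results[0];
    } catch (error: any) {
      // Only retry rate limits; rethrow everything else immediately
      if (error.status !== 429 || attempt === maxAttempts) {
        throw error;
      }
      // Exponential backoff: 500ms, 1000ms, 2000ms, ...
      const delay = baseDelayMs * 2 ** (attempt - 1);
      await new Promise(resolve => setTimeout(resolve, delay));
    }
  }
  // Unreachable, but makes the failure mode explicit for the compiler
  throw new Error('Retry attempts exhausted');
}
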
// =============================================================================
// MAIN EXECUTION
// =============================================================================

async function main() {
  console.log('=== OpenAI Moderation API Examples ===\n');

  // Example 1: Basic moderation
  console.log('1. Basic Moderation:');
  await basicModeration();
  console.log();

  // Example 2: All categories
  console.log('2. All Safety Categories:');
  await allCategories();
  console.log();

  // Example 3: Custom thresholds
  console.log('3. Custom Thresholds:');
  await withCustomThresholds('This is a test message');
  console.log();

  // Example 4: Batch moderation
  console.log('4. Batch Moderation:');
  await batchModeration();
  console.log();

  // Example 5: Production pattern
  console.log('5. Production UGC Moderation:');
  const decision = await moderateUserContent('Safe user comment');
  console.log('Decision:', decision);
  console.log();

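  // Examples 6-8 (added so every exported helper is demonstrated; the input
  // strings are illustrative placeholders)
  console.log('6. Category-Specific Filtering:');
  await filterByCategory('This is a test message', ['hate', 'violence']);
  console.log();

  console.log('7. Moderation with Logging:');
  await moderateWithLogging('This is a test message');
  console.log();

  console.log('8. User-Friendly Feedback:');
  const feedbackResult = await withErrorHandling('This is a test message');
  console.log('Message:', getUserFriendlyMessage(feedbackResult));
  console.log();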
}

// Run if executed directly
if (require.main === module) {
  main().catch(console.error);
}

export {
  basicModeration,
  allCategories,
  withCustomThresholds,
  batchModeration,
  moderateUserContent,
  filterByCategory,
  moderateWithLogging,
  getUserFriendlyMessage,
  withErrorHandling,
};