Initial commit
This commit is contained in:
62
skills/extract_from_pdfs/assets/api_config_template.json
Normal file
62
skills/extract_from_pdfs/assets/api_config_template.json
Normal file
@@ -0,0 +1,62 @@
|
||||
{
|
||||
"_comment": "Configuration for API validation in step 05",
|
||||
"_instructions": [
|
||||
"Specify which external APIs to use for validating/enriching each field",
|
||||
"Available APIs:",
|
||||
" - gbif_taxonomy: GBIF for biological taxonomy",
|
||||
" - wfo_plants: World Flora Online for plant names",
|
||||
" - geonames: GeoNames for geographic locations (requires account)",
|
||||
" - geocode: OpenStreetMap Nominatim for geocoding",
|
||||
" - pubchem: PubChem for chemical compounds",
|
||||
" - ncbi_gene: NCBI Gene database",
|
||||
"Customize the field_mappings below based on your extraction schema"
|
||||
],
|
||||
|
||||
"field_mappings": {
|
||||
"_example_species_field": {
|
||||
"api": "gbif_taxonomy",
|
||||
"output_field": "validated_species",
|
||||
"description": "Validate species names against GBIF"
|
||||
},
|
||||
|
||||
"_example_location_field": {
|
||||
"api": "geocode",
|
||||
"output_field": "geocoded_location",
|
||||
"description": "Geocode location to lat/lon coordinates"
|
||||
},
|
||||
|
||||
"_example_compound_field": {
|
||||
"api": "pubchem",
|
||||
"output_field": "validated_compound",
|
||||
"description": "Validate chemical compound names"
|
||||
}
|
||||
},
|
||||
|
||||
"nested_field_mappings": {
|
||||
"_comment": "For fields nested in 'records' array",
|
||||
"_example": "records.species would validate the 'species' field within each record",
|
||||
|
||||
"records.species": {
|
||||
"api": "gbif_taxonomy",
|
||||
"output_field": "validated_species"
|
||||
},
|
||||
|
||||
"records.location": {
|
||||
"api": "geocode",
|
||||
"output_field": "coordinates"
|
||||
}
|
||||
},
|
||||
|
||||
"api_specific_settings": {
|
||||
"geonames": {
|
||||
"_note": "Requires free account at geonames.org",
|
||||
"_setup": "Set GEONAMES_USERNAME environment variable"
|
||||
},
|
||||
|
||||
"rate_limits": {
|
||||
"_comment": "Be respectful of API rate limits",
|
||||
"default_delay_seconds": 0.5,
|
||||
"nominatim_delay_seconds": 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
{
|
||||
"_comment": "Example API configuration for ecology/flower visitor research",
|
||||
"_note": "This shows how to validate taxonomic and geographic data",
|
||||
|
||||
"field_mappings": {
|
||||
"plant_species": {
|
||||
"api": "wfo_plants",
|
||||
"output_field": "validated_plant_taxonomy",
|
||||
"description": "Validate plant species names against World Flora Online"
|
||||
},
|
||||
|
||||
"country": {
|
||||
"api": "geonames",
|
||||
"output_field": "validated_country",
|
||||
"description": "Validate and standardize country names"
|
||||
}
|
||||
},
|
||||
|
||||
"nested_field_mappings": {
|
||||
"_comment": "These apply to fields within the 'records' array",
|
||||
|
||||
"records.plant_species": {
|
||||
"api": "wfo_plants",
|
||||
"output_field": "validated_plant_taxonomy",
|
||||
"extra_params": {}
|
||||
},
|
||||
|
||||
"records.country": {
|
||||
"api": "geonames",
|
||||
"output_field": "geocoded_country"
|
||||
},
|
||||
|
||||
"records.locality": {
|
||||
"api": "geocode",
|
||||
"output_field": "coordinates",
|
||||
"description": "Get coordinates for field sites"
|
||||
}
|
||||
},
|
||||
|
||||
"validation_rules": {
|
||||
"plant_species": {
|
||||
"required": true,
|
||||
"validate_taxonomy": true,
|
||||
"accept_genus_only": false
|
||||
},
|
||||
|
||||
"visitors": {
|
||||
"type": "array",
|
||||
"min_items": 1,
|
||||
"validate_items": false,
|
||||
"_note": "Visitor names are kept as written and are not validated against taxonomy"
|
||||
},
|
||||
|
||||
"location_completeness": {
|
||||
"require_country": true,
|
||||
"require_coordinates": false,
|
||||
"_note": "Country is required but exact coordinates are optional"
|
||||
}
|
||||
},
|
||||
|
||||
"api_settings": {
|
||||
"retry_on_failure": true,
|
||||
"max_retries": 3,
|
||||
"timeout_seconds": 10,
|
||||
|
||||
"rate_limits": {
|
||||
"wfo": 1.0,
|
||||
"geonames": 0.5,
|
||||
"nominatim": 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,169 @@
|
||||
{
|
||||
"_comment": "Example extraction schema based on flower visitor ecology research",
|
||||
"_note": "This is a real-world example. Copy and adapt for your own domain.",
|
||||
|
||||
"objective": "carefully analyze this paper and extract empirical observations of flower visitors",
|
||||
|
||||
"system_context": "You are a scientific research assistant specializing in analyzing papers about plant-pollinator interactions. Your task is to analyze scientific papers and extract structured data for a meta-analysis of flower visitation.",
|
||||
|
||||
"instructions": [
|
||||
"Determine if the paper contains any empirical observations of flower visitors",
|
||||
"If empirical observations are present, extract all records of flower visitors",
|
||||
"Each record should represent observations of one plant species in one locality"
|
||||
],
|
||||
|
||||
"analysis_steps": [
|
||||
"1. Identify and quote relevant sections of the paper that contain empirical primary observations of flower visitors. If there is no primary data, explain why and do not create any records.",
|
||||
"2. List out each plant species mentioned in these observations. Consider species as the smallest taxonomic unit for plants. If there are multiple varieties or subspecies, summarize all records for the same species as a single record.",
|
||||
"3. For each plant species, extract the required information: location, method of observation, time of observation, and list of ALL flower visitors (be comprehensive)",
|
||||
"4. Assess whether any visitors or pollinators are beetles (Coleoptera). For each visitor, classify as 'Beetle' or 'Non-beetle'",
|
||||
"5. Evaluate whether the methods are unbiased by checking observation times and methods",
|
||||
"6. Double-check your findings for accuracy and completeness"
|
||||
],
|
||||
|
||||
"important_notes": [
|
||||
"Only include PRIMARY observations from the paper. Do not consider secondary data or citations",
|
||||
"If a record involves more than one plant species or country, separate it into multiple records",
|
||||
"Do not add any variables to the output that are not explicitly listed in the schema",
|
||||
"Do not use external information to update taxonomic names. List names as they appear in the source",
|
||||
"If anything is unknown, use 'none' or empty lists as appropriate",
|
||||
"Always include all records in the response, even if it ends up being extremely long"
|
||||
],
|
||||
|
||||
"output_schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"has_primary_visitor_data": {
|
||||
"type": "boolean",
|
||||
"description": "Whether there are primary observations about flower visitors in this study"
|
||||
},
|
||||
"has_visitor_notes": {
|
||||
"type": "string",
|
||||
"description": "Brief explanation of evidence supporting the assessment"
|
||||
},
|
||||
"response_truncated": {
|
||||
"type": "boolean",
|
||||
"description": "Whether there were too many records to extract all of them (i.e., the records list is incomplete)"
|
||||
},
|
||||
"noteworthy_beetle_fact": {
|
||||
"type": "string",
|
||||
"description": "One or two sentences summarizing noteworthy facts about beetles discovered in this study"
|
||||
},
|
||||
"beetle_pollen_feeders": {
|
||||
"type": "boolean",
|
||||
"description": "Whether the paper mentions any beetles feeding on pollen as adults"
|
||||
},
|
||||
"beetle_nectar_feeders": {
|
||||
"type": "boolean",
|
||||
"description": "Whether the paper mentions any beetles drinking nectar as adults"
|
||||
},
|
||||
"beetle_florivores": {
|
||||
"type": "boolean",
|
||||
"description": "Whether beetles damage flower parts other than pollen and nectar"
|
||||
},
|
||||
"beetle_larval_breeding": {
|
||||
"type": "boolean",
|
||||
"description": "Whether beetle larvae feed on parts of the same plant visited by adults"
|
||||
},
|
||||
"records": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"country": {
|
||||
"type": "string",
|
||||
"description": "Country name"
|
||||
},
|
||||
"state_province": {
|
||||
"type": "string",
|
||||
"description": "State or province name"
|
||||
},
|
||||
"locality": {
|
||||
"type": "string",
|
||||
"description": "Specific location of the observation"
|
||||
},
|
||||
"plant_species": {
|
||||
"type": "string",
|
||||
"description": "Plant species name"
|
||||
},
|
||||
"method": {
|
||||
"type": "string",
|
||||
"description": "One-sentence description of observation methods"
|
||||
},
|
||||
"observation_time": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": ["day", "night", "dawn", "dusk"]
|
||||
},
|
||||
"description": "List of observation times"
|
||||
},
|
||||
"visitors": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "List of all flower visitors observed"
|
||||
},
|
||||
"beetle_families": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "List of beetle families mentioned as flower visitors"
|
||||
},
|
||||
"beetle_visitors": {
|
||||
"type": "boolean",
|
||||
"description": "Whether beetles were found as flower visitors"
|
||||
},
|
||||
"beetle_pollinators": {
|
||||
"type": "boolean",
|
||||
"description": "Whether beetles were found as significant pollinators"
|
||||
},
|
||||
"methods_unbiased": {
|
||||
"type": "boolean",
|
||||
"description": "Whether methods appear to be unbiased"
|
||||
},
|
||||
"methods_biased_reasoning": {
|
||||
"type": "string",
|
||||
"description": "One-sentence explanation for bias assessment"
|
||||
}
|
||||
},
|
||||
"required": ["country", "plant_species", "method", "visitors"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["has_primary_visitor_data", "records"]
|
||||
},
|
||||
|
||||
"output_example": {
|
||||
"has_primary_visitor_data": true,
|
||||
"has_visitor_notes": "Paper reports direct field observations of flower visitors across multiple species",
|
||||
"response_truncated": false,
|
||||
"noteworthy_beetle_fact": "Beetles from the family Scarabaeidae were observed as frequent visitors and effective pollen carriers",
|
||||
"beetle_pollen_feeders": true,
|
||||
"beetle_nectar_feeders": false,
|
||||
"beetle_florivores": false,
|
||||
"beetle_larval_breeding": false,
|
||||
"records": [
|
||||
{
|
||||
"country": "Brazil",
|
||||
"state_province": "São Paulo",
|
||||
"locality": "Parque Estadual da Serra do Mar",
|
||||
"plant_species": "Magnolia ovata",
|
||||
"method": "Direct observation of floral visitors during anthesis over 3 days",
|
||||
"observation_time": ["day", "night"],
|
||||
"visitors": [
|
||||
"Cyclocephala paraguayensis (Coleoptera: Scarabaeidae)",
|
||||
"Apis mellifera (Hymenoptera: Apidae)",
|
||||
"Trigona spinipes (Hymenoptera: Apidae)"
|
||||
],
|
||||
"beetle_families": ["Scarabaeidae"],
|
||||
"beetle_visitors": true,
|
||||
"beetle_pollinators": true,
|
||||
"methods_unbiased": true,
|
||||
"methods_biased_reasoning": "Observations conducted during both day and night, allowing detection of nocturnal visitors"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
105
skills/extract_from_pdfs/assets/schema_template.json
Normal file
105
skills/extract_from_pdfs/assets/schema_template.json
Normal file
@@ -0,0 +1,105 @@
|
||||
{
|
||||
"_comment": "This is a template extraction schema. Customize for your specific use case.",
|
||||
"_instructions": "Fill in the sections below with your specific extraction requirements.",
|
||||
|
||||
"objective": "carefully analyze this paper and extract [DESCRIBE YOUR DATA TYPE, e.g., 'empirical observations of X', 'experimental measurements of Y', etc.]",
|
||||
|
||||
"system_context": "You are a scientific research assistant specializing in [YOUR DOMAIN, e.g., 'ecology', 'chemistry', 'medicine', etc.]. Your task is to analyze scientific papers and extract structured data for systematic review and meta-analysis.",
|
||||
|
||||
"instructions": [
|
||||
"Determine if the paper contains [YOUR CRITERIA, e.g., 'primary empirical data']",
|
||||
"If present, extract all [YOUR RECORD TYPE, e.g., 'observation records', 'measurements', 'outcomes']",
|
||||
"For each record, extract the following information: [LIST KEY FIELDS]"
|
||||
],
|
||||
|
||||
"analysis_steps": [
|
||||
"1. Identify and quote relevant sections containing [YOUR DATA TYPE]",
|
||||
"2. List out each [RECORD UNIT, e.g., 'species', 'compound', 'patient cohort']",
|
||||
"3. For each unit, extract required information and quote supporting text",
|
||||
"4. [ADD DOMAIN-SPECIFIC VALIDATION STEPS]",
|
||||
"5. Double-check for accuracy and completeness"
|
||||
],
|
||||
|
||||
"important_notes": [
|
||||
"Only include PRIMARY data from this paper, not secondary sources",
|
||||
"If a record involves multiple [UNITS], separate into individual records",
|
||||
"Do not add fields not in the schema",
|
||||
"Use 'none' or empty lists for unknown values",
|
||||
"List names exactly as they appear in the source"
|
||||
],
|
||||
|
||||
"output_schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"has_relevant_data": {
|
||||
"type": "boolean",
|
||||
"description": "Whether the paper contains the target data type"
|
||||
},
|
||||
"data_description": {
|
||||
"type": "string",
|
||||
"description": "Brief explanation of what data is present"
|
||||
},
|
||||
"records": {
|
||||
"type": "array",
|
||||
"description": "List of extracted records",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"_comment": "CUSTOMIZE THE FIELDS UNDER 'properties' FOR YOUR USE CASE",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "Geographic location (if applicable)"
|
||||
},
|
||||
"subject": {
|
||||
"type": "string",
|
||||
"description": "Main subject of the record (species, compound, etc.)"
|
||||
},
|
||||
"measurement_type": {
|
||||
"type": "string",
|
||||
"description": "Type of measurement or observation"
|
||||
},
|
||||
"value": {
|
||||
"type": ["number", "string"],
|
||||
"description": "Measured or observed value"
|
||||
},
|
||||
"units": {
|
||||
"type": "string",
|
||||
"description": "Units of measurement"
|
||||
},
|
||||
"method": {
|
||||
"type": "string",
|
||||
"description": "Brief description of methodology"
|
||||
},
|
||||
"sample_size": {
|
||||
"type": "integer",
|
||||
"description": "Sample size if applicable"
|
||||
},
|
||||
"notes": {
|
||||
"type": "string",
|
||||
"description": "Additional relevant notes"
|
||||
}
|
||||
},
|
||||
"required": ["subject"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["has_relevant_data", "records"]
|
||||
},
|
||||
|
||||
"output_example": {
|
||||
"has_relevant_data": true,
|
||||
"data_description": "Paper reports 5 observation records across 3 locations",
|
||||
"records": [
|
||||
{
|
||||
"location": "Example Location",
|
||||
"subject": "Example Subject",
|
||||
"measurement_type": "Example Type",
|
||||
"value": 42.5,
|
||||
"units": "mg/L",
|
||||
"method": "Brief methodology description",
|
||||
"sample_size": 20,
|
||||
"notes": "Any relevant notes"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user