Files
gh-ai4curation-curation-ski…/skills/editing-obo-ontologies/obo-checkin.pl
2025-11-29 17:51:42 +08:00

202 lines
4.8 KiB
Perl
Executable File

#!/usr/bin/perl -w
use strict;
use FileHandle;
# Defaults for the command-line options.
my $outdir = "terms";       # directory holding the per-term .obo files (-d / --outdir)
my $cmd;                    # not referenced by the rest of this script
my $dry_run = 0;            # -n / --dry-run: do not replace the ontology file
my $preserve_files = 0;     # -p / --preserve-files: keep term files after check-in

# Consume the leading options. Processing stops at the first argument that
# does not start with "-", so options must precede the ontology file name.
# Testing @ARGV first avoids matching against undef (and the warning that
# -w would emit) once the argument list is exhausted.
while (@ARGV && $ARGV[0] =~ /^-/) {
    my $opt = shift @ARGV;
    if ($opt eq '-h' || $opt eq '--help') {
        print usage();
        exit 0;
    }
    elsif ($opt eq '-d' || $opt eq '--outdir') {
        # Without this check a trailing -d would leave $outdir undefined.
        die "$opt requires a directory argument\n" unless @ARGV;
        $outdir = shift @ARGV;
    }
    elsif ($opt eq '-n' || $opt eq '--dry-run') {
        $dry_run = 1;
    }
    elsif ($opt eq '-p' || $opt eq '--preserve-files') {
        $preserve_files = 1;
    }
    else {
        # Unknown options are still skipped, but no longer silently.
        warn "ignoring unknown option $opt\n";
    }
}

# Make sure the term directory exists. The list form of system() passes the
# directory name as a single argument without involving a shell, so names
# containing spaces or shell metacharacters are handled correctly.
system('mkdir', '-p', '--', $outdir) == 0
    or warn "could not create directory $outdir\n";
# File-level placeholders; the loops below declare their own lexicals with
# the same names, so these are not read anywhere in this section.
my $id;
my $stanza = "";
my @alt_ids = ();

# First positional argument: the ontology file to check the terms into.
my $fn = shift @ARGV;
die "missing OBO-FILE argument\n" . usage() unless defined $fn;

# ensure ids are sorted
my @ids = sort @ARGV;

# Maps stanza id => stanza text for everything that is being checked in.
my %new_stanza_map = ();

foreach my $id (@ids) {
    # An argument that looks like a path (contains "." or "/") and names an
    # existing file is read as a file of one or more stanzas.
    if ($id =~ m@[\./]@ && -e $id) {
        open(my $block_fh, '<', $id) or die "cannot open $id: $!";
        my $uber_stanza = do { local $/; <$block_fh> };
        close($block_fh);
        $uber_stanza = "" unless defined $uber_stanza;

        # Stanzas are separated by blank lines. Allowing whitespace inside
        # the separator also copes with CRLF line endings, whitespace-only
        # lines and runs of several blank lines.
        foreach my $stanza (split(/\n\s*\n/, $uber_stanza)) {
            # trim surrounding whitespace
            $stanza =~ s/\A\s+//;
            $stanza =~ s/\s+$//;
            next unless length($stanza);

            # The id tag must start a line, so that e.g. "alt_id:" or text
            # inside a definition is never mistaken for it.
            if ($stanza =~ /^[ \t]*id:\s+(\S+)/m) {
                my $stanza_id = $1;
                $new_stanza_map{$stanza_id} = "$stanza\n\n";
            }
            else {
                die "no id found in $stanza";
            }
        }
    }
    else {
        # Otherwise the argument is a term id whose stanza lives in its own
        # file, located by get_path().
        my $path = get_path($id);
        open(my $term_fh, '<', $path) or die "cannot open $path: $!";
        my $stanza = "";
        while (my $line = <$term_fh>) {
            chomp $line;
            # re-adding the newline guarantees the last line is terminated
            $stanza .= "$line\n";
        }
        close($term_fh);

        if ($stanza =~ /^[ \t]*id:\s+(\S+)/m) {
            my $stanza_id = $1;
            # The argument may be a CURIE (GO:0000001) or an OWL local name
            # (GO_0000001); compare both in the underscore form so that either
            # spelling is accepted for the same term.
            (my $wanted = $id) =~ s@:@_@;
            (my $found = $stanza_id) =~ s@:@_@;
            if ($found ne $wanted) {
                die "id mismatch $stanza_id ne $id";
            }
            # Key by the id written in the stanza, so the entry replaces the
            # matching stanza of the ontology file instead of adding a
            # second copy under the local-name spelling.
            $new_stanza_map{$stanza_id} = $stanza;
        }
        else {
            die "no id found in $path";
        }
    }
}
# Merge the stanzas being checked in with those already in the ontology
# file and write the result to "$fn.tmp".
my %stanza_map = ();
my %stanza_type_map = (); # To track stanza type (Term or Typedef)

# Open the input first so that no temporary file is left behind when the
# ontology file cannot be read.
open(my $obo_in, '<', $fn) or die "cannot open $fn: $!";
open(my $tmp_out, '>', "$fn.tmp") or die "cannot write to $fn.tmp: $!";

{
    # Read the file one blank-line-separated record at a time. The record
    # separator is localized so the change does not leak to later reads.
    local $/ = "\n\n";
    while (my $record = <$obo_in>) {
        # A record with an id tag at the start of a line is a stanza; it is
        # held back so it can be sorted. Anything else (e.g. the header) is
        # copied straight through, in its original order.
        if ($record =~ /^[ \t]*id:\s+(\S+)/m) {
            $stanza_map{$1} = $record;
        }
        else {
            print $tmp_out $record;
        }
    }
}
close($obo_in);

# combine old and new stanzas; a new stanza replaces an old one with the same id
foreach my $id (sort keys %new_stanza_map) {
    $stanza_map{$id} = $new_stanza_map{$id};
}

# Determine the stanza type from the "[Type]" line of the final text of
# each stanza, defaulting to Term if no such line is present. The match is
# anchored to a line start so a bracketed word elsewhere is not picked up.
foreach my $id (keys %stanza_map) {
    if ($stanza_map{$id} =~ /^[ \t]*\[(\w+)\]/m) {
        $stanza_type_map{$id} = $1;
    }
    else {
        $stanza_type_map{$id} = "Term";
    }
}

# Sort ids by stanza type (Typedef stanzas last) and then alphabetically
# by id within each group.
my @sorted_ids = sort {
    ($stanza_type_map{$a} eq "Typedef") <=> ($stanza_type_map{$b} eq "Typedef")
        || $a cmp $b
} keys %stanza_map;

foreach my $id (@sorted_ids) {
    my $s = $stanza_map{$id};
    # Make every stanza end in exactly one blank line. Matching zero or
    # more line terminators at the very end also covers a stanza with no
    # trailing newline at all (the last record of a file that lacks a final
    # newline), which would otherwise run into the stanza printed after it.
    $s =~ s@[\r\n]*\z@\n\n@;
    print $tmp_out $s;
}
# Buffered write errors surface at close time.
close($tmp_out) or die "cannot write to $fn.tmp: $!";
# Install the merged file, unless this is a dry run.
# NOTE(review): in a dry run "$fn.tmp" has already been written and is left
# in place; confirm whether that is intended (e.g. for inspection).
if ($dry_run) {
    print "dry run, no changes made\n";
}
else {
    # rename() replaces the ontology file without going through a shell,
    # so file names with spaces or metacharacters work, and a failure is
    # reported before any term file is deleted.
    rename("$fn.tmp", $fn) or die "cannot rename $fn.tmp to $fn: $!";

    # clear out the checked-in files for @ids, unless asked to keep them
    if (!$preserve_files) {
        foreach my $id (@ids) {
            unlink(get_path($id));
        }
    }
}
# get the path for an id
# the ID should be either:
# - an ontology curie, e.g. GO:0000001, in which case the path is terms/GO_0000001.obo
# - an OWL local name, e.g. GO_0000001, in which case the path is terms/GO_0000001.obo
# - a file name, e.g. terms/my_terms.obo, in which case the path is terms/my_terms.obo
# ("terms" above stands for the current value of $outdir)
sub get_path {
    my ($id) = @_;

    # If the argument contains "." or "/" and names an existing file, it is
    # a file name: return it untouched. This is the same test the main loop
    # applies, and it is done before the colon is rewritten so that a path
    # which itself contains ":" is still found.
    if ($id =~ m@[\./]@ && -e $id) {
        return $id;
    }

    # Turn a curie into its local-name form (first ":" becomes "_").
    my $fn = "$id";
    $fn =~ s@:@_@;

    # The rewritten name may also be an existing file.
    if ($fn =~ m@[\./]@ && -e $fn) {
        return $fn;
    }

    # Otherwise the term lives in its own file inside the term directory.
    return "$outdir/$fn.obo";
}
# Write a stanza to the file that get_path() gives for the id, replacing
# any existing content.
#   $id     - term id (or file name) used to locate the file
#   $stanza - text to write
# Dies if the file cannot be opened or written.
sub w {
    my ($id, $stanza) = @_;
    my $path = get_path($id);
    # Three-argument open with a lexical handle: the path is never
    # interpreted as a mode, and the shared bareword handle F is not touched.
    open(my $fh, '>', $path) or die "cannot write $path: $!";
    print $fh $stanza;
    # Buffered write errors are only reported when the handle is closed.
    close($fh) or die "cannot write $path: $!";
}
# Name of the running script: the last "/"-separated component of $0.
sub scriptname {
    my @segments = split m{/}, $0;
    return $segments[-1];
}
# Build the help text shown for -h / --help.
# Returns the text as a single string; the caller prints it.
# The synopsis lists the options before OBO-FILE because the option parser
# only recognizes options that precede the first positional argument.
sub usage {
    my $sn = scriptname();
    return <<EOM;
$sn [-h] [-d TERM-DIR] [-n] [-p] OBO-FILE TERM1 TERM2 ...

Checks in obo files from TERM-DIR into the OBO-FILE

Options (must come before OBO-FILE):
  -h, --help            print this message and exit
  -d, --outdir DIR      directory holding the term files (default: terms)
  -n, --dry-run         write OBO-FILE.tmp but do not replace OBO-FILE
  -p, --preserve-files  do not delete the term files after check-in

Example:

  $sn src/ontology/foo-edit.obo FOO:0000087 FOO:0000081

This will check in the FOO:0000087 and FOO:0000081 terms from the terms directory
into the foo-edit.obo file.
EOM
}