codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
#!/usr/bin/perl -w

# load_alignment.pl - load the hits and HSPs of a BLAST report into a chado
# database through Bio::Chado::Schema. Usage is described in the POD at the
# end of the file.

use strict;
use warnings;
use Pod::Usage;
use Getopt::Long;
use Bio::SearchIO;
use Bio::Chado::Schema;
use Try::Tiny;
use YAML qw/LoadFile/;
use Log::Log4perl qw/:easy/;
use Log::Log4perl::Appender;
use Log::Log4perl::Layout::PatternLayout;

my ( $dsn, $user, $pass, $query_org, $query_type, $update, $config,
    $verbose, $sql_verbose );
my $query_parser = 'none';
my $hit_parser   = 'none';
my $db_source    = 'GFF_source';
my $source       = 'dictyBase_blast';
my $seq_onto     = 'sequence';

# NOTE(review): $option is parsed from the command line but is never handed
# to the database connection below - confirm whether it should be.
my $option = { LongReadLen => 2**15 };

GetOptions(
    'h|help'                => sub { pod2usage(1); },
    'qorg|query_org=s'      => \$query_org,
    'qtype|query_type:s'    => \$query_type,
    'dsn=s'                 => \$dsn,
    'u|user=s'              => \$user,
    'p|pass|password=s'     => \$pass,
    'opt|dbopt:s'           => \$option,
    'qp|query_parser:s'     => \$query_parser,
    'hp|hit_parser:s'       => \$hit_parser,
    's|src|source:s'        => \$source,
    'so|seq_onto:s'         => \$seq_onto,
    'update'                => \$update,
    # takes a value: without "=s" the option was a plain flag and could
    # never carry the database name it is documented to accept
    'db_src|db_source=s'    => \$db_source,
    'c|config:s'            => \$config,
    'v|verbose'             => \$verbose,
    'sql_verbose'           => \$sql_verbose,
);

# Values found in the YAML configuration take precedence over the ones given
# on the command line.
if ($config) {
    my $str = LoadFile($config);

    my $db_conf = $str->{database};
    if ($db_conf) {
        $dsn  = $db_conf->{dsn}      || $dsn;
        $user = $db_conf->{user}     || $user;
        $pass = $db_conf->{password} || $pass;
    }

    my $query_conf = $str->{query};
    if ($query_conf) {
        $query_org    = $query_conf->{organism} || $query_org;
        $query_parser = $query_conf->{parser}   || $query_parser;
    }

    $hit_parser = $str->{target}->{parser} || $hit_parser;
    $seq_onto   = $str->{so}               || $seq_onto;
    $db_source  = $str->{database_source}  || $db_source;
    $source     = $str->{source}           || $source;
}

pod2usage("no blast alignment file is given") if !$ARGV[0];
pod2usage("no query organism is given")       if !$query_org;

# Declared separately from the conditional assignment: "my $x = ... if COND"
# has undefined behaviour in perl.
my $logger;
$logger = setup_logger() if $verbose;

# SO term used for the query feature, keyed by blast program
my %type_map = (
    blastn  => 'DNA',
    blastp  => 'protein',
    tblastn => 'protein',
    blastx  => 'DNA',
    tblastx => 'DNA',
);

# SO term used for the hit (match) feature, keyed by blast program
my %match_map = (
    blastn  => 'nucleotide_match',
    blastp  => 'protein_match',
    tblastn => 'protein_match',
    blastx  => 'translated_protein_match',
    tblastx => 'translated_protein_match',
);

# Each parser receives the raw name from the blast header and returns the id
# that is looked up in (or stored to) the feature table.
my %query_parser_map = (
    'ncbi'    => sub { ( ( split /\|/, $_[0] ) )[1] },
    'regular' => sub { ( ( split /\|/, $_[0] ) )[0] },
    'dicty'   => sub {
        my $id = ( ( split /\|/, $_[0] ) )[1];
        $id =~ s/\s+$//;
        $id;
    },
    'none' => sub {
        my $id = $_[0];
        $id =~ s/\s+$//;
        $id;
    }
);

my %hit_parser_map = (
    'ncbi'    => sub { ( ( split /\|/, $_[0] ) )[1] },
    'regular' => sub { ( ( split /\|/, $_[0] ) )[0] },
    'dicty'   => sub {
        my $id = ( ( split /\|/, $_[0] ) )[0];
        $id =~ s/\s+$//;
        $id;
    },
    'none' => sub {
        my $id = $_[0];
        $id =~ s/\s+$//;
        $id;
    }
);

# An unknown parser name would otherwise only surface in the middle of the
# run as "Can't use an undefined value as a subroutine reference".
pod2usage("unknown query parser: $query_parser")
    if !exists $query_parser_map{$query_parser};
pod2usage("unknown hit parser: $hit_parser")
    if !exists $hit_parser_map{$hit_parser};

my $searchio = Bio::SearchIO->new( -file => $ARGV[0], -format => 'blast' );
my $schema = Bio::Chado::Schema->connect( $dsn, $user, $pass );
$schema->storage->verbose(1) if $sql_verbose;

#check if the sequence ontology namespace exists
my $so = $schema->resultset('Cv::Cv')->find( { name => $seq_onto } );
pod2usage("sequence ontology namespace $seq_onto does not exist") if !$so;

#get the dbsource
my $db = $schema->resultset('General::Db')->find( { name => $db_source } );
if ( !$db ) {
    try {
        $db = $schema->txn_do(
            sub {
                $schema->resultset('General::Db')
                    ->create( { name => $db_source } );
            }
        );
    }
    catch {
        die "unable to create source record $db_source $_";
    };
}

my $dbxref;
try {
    $dbxref = $schema->txn_do(
        sub {
            $schema->resultset('General::Dbxref')->find_or_create(
                { accession => $source, db_id => $db->db_id } );
        }
    );
}
catch {
    die "unable to create dbxref record for source: $source $_ \n";
};

my $match_part = $schema->resultset('Cv::Cvterm')
    ->find( { name => 'match_part', cv_id => $so->cv_id } );
pod2usage("unable to fetch *match_part* cvterm: please insert that first")
    if !$match_part;

# part_of links every HSP to its hit; without it each HSP insert would fail
# and leave hits without parts, so check it up front like match_part.
my $part_of = $schema->resultset('Cv::Cvterm')
    ->find( { name => 'part_of', cv_id => $so->cv_id }, );
pod2usage("unable to fetch *part_of* cvterm: please insert that first")
    if !$part_of;

my $description
    = $schema->resultset('Cv::Cvterm')->find( { name => 'description' } );
pod2usage("unable to find description cvterm in database") if !$description;

#get the query organism
my $organism = $schema->resultset('Organism::Organism')->search(
    {   -or => [
            'common_name'  => $query_org,
            'abbreviation' => $query_org,
            'species'      => $query_org,
        ],
    },
    {   'select' => [qw/species organism_id common_name/],
        rows     => 1
    }
)->single;
pod2usage("$query_org organism does not exist in our database")
    if !$organism;

my $analysis;          # analysis row, created once from the first result
my $query_type_row;    # cvterm row used as type of new query features
my $match_type;        # cvterm row used as type of hit features

RESULT:
while ( my $result = $searchio->next_result ) {
    next RESULT if $result->no_hits_found;

    my $algorithm = $result->algorithm;

    $analysis
        ||= $schema->resultset('Companalysis::Analysis')->find_or_create(
        {   program        => $algorithm,
            programversion => $result->algorithm_version,
            sourcename     => $source,
            name           => $algorithm . '_' . $source,
            description    => $algorithm . ' alignment'
        }
        );

    # -qtype arrives as a cvterm *name*; resolve it to a row just like the
    # default taken from the blast program, instead of calling ->cvterm_id on
    # the plain string.
    $query_type_row ||= $schema->resultset('Cv::Cvterm')->find(
        {   name  => $query_type || $type_map{ lc $algorithm },
            cv_id => $so->cv_id
        }
    );
    $match_type ||= $schema->resultset('Cv::Cvterm')->find(
        {   name  => $match_map{ lc $algorithm },
            cv_id => $so->cv_id
        }
    );
    if ( !$match_type ) {
        warn "unable to find match type cvterm for $algorithm\n";
        next RESULT;
    }

    my $query_id = $query_parser_map{$query_parser}->( $result->query_name );
    my $query_row = $schema->resultset('Sequence::Feature')->search(
        {   -and => [
                'is_deleted'  => 0,
                'is_analysis' => 1,
                -or           => [
                    'dbxref.accession' => $query_id,
                    'uniquename'       => $query_id,
                    'name'             => $query_id
                ]
            ]
        },
        {   join   => 'dbxref',
            select => [qw/feature_id type_id uniquename name/],
            rows   => 1
        }
    )->single;

    if ($query_row) {
        if ($verbose) {
            $logger->info("query record $query_id is present");
            $logger->info( $query_row->uniquename );
        }
    }
    else {
        if ( !$query_type_row ) {
            warn "failed to create record for $query_id ",
                "no cvterm found for its type\n";
            next RESULT;
        }
        my $create = sub {
            $schema->resultset('Sequence::Feature')->create(
                {   uniquename  => $query_id,
                    name        => $query_id,
                    organism_id => $organism->organism_id,
                    is_analysis => 1,
                    type_id     => $query_type_row->cvterm_id,
                    dbxref      => {
                        accession => $query_id,
                        db_id     => $db->db_id
                    },
                }
            );
        };

        # try/catch blocks are anonymous subs, so loop control has to happen
        # out here rather than inside the catch block.
        $query_row = try {
            my $row = $schema->txn_do($create);
            $logger->info("created record for query $query_id") if $verbose;
            $row;
        }
        catch {
            warn "failed to create record for $query_id $_";
            return;
        };
        next RESULT if !$query_row;
    }

    remove_alignments( $schema, $query_row, $match_type->name ) if $update;

HIT:
    while ( my $hit = $result->next_hit ) {
        my $hit_id = $hit_parser_map{$hit_parser}->( $hit->name );
        my $target_row = $schema->resultset('Sequence::Feature')->search(
            {   -and => [
                    'is_deleted' => 0,
                    -or          => [
                        'uniquename'       => $hit_id,
                        'dbxref.accession' => $hit_id,
                        'name'             => $hit_id
                    ]
                ]
            },
            {   join     => 'dbxref',
                'select' => [qw/feature_id organism_id/],
                rows     => 1
            }
        )->single;

        if ( !$target_row ) {
            warn "unable to find hit for hit id: $hit_id\n";
            next HIT;
        }

        #additional grouping of hsp's by the hit strand as in case of tblastn hsp
        #belonging to separate strand of query could be grouped into the same hit, however
        #they denotes separate matches and should be separated.
        my %hsps_on;
        while ( my $hsp = $hit->next_hsp ) {
            my $strand = $hsp->strand('hit') == 1 ? 'plus' : 'minus';
            push @{ $hsps_on{$strand} }, $hsp;
        }

    STRAND:
        foreach my $strand ( keys %hsps_on ) {
            my $hit_value = uniq_hit_id( $query_id, $hit_id, $strand, $hit );

            # Creates the match feature together with its featureloc on the
            # target; both happen in one transaction.
            my $hit_create = sub {
                my $hit_row
                    = $schema->resultset('Sequence::Feature')->create(
                    {   uniquename  => $hit_value,
                        name        => $hit_value,
                        organism_id => $target_row->organism_id,
                        is_analysis => 1,
                        seqlen      => $hit->length,
                        type_id     => $match_type->cvterm_id,
                        dbxref      => {
                            accession => $hit_value,
                            db_id     => $db->db_id
                        },
                        analysisfeatures => [
                            {   analysis_id  => $analysis->analysis_id,
                                rawscore     => $hit->bits,
                                normscore    => $hit->score,
                                significance => $hit->significance
                            }
                        ],
                        feature_dbxrefs =>
                            [ { dbxref_id => $dbxref->dbxref_id } ]
                    }
                    );

                # single insert instead of an empty insert followed by update
                $schema->resultset('Sequence::Featureloc')->create(
                    {   feature_id    => $hit_row->feature_id,
                        srcfeature_id => $target_row->feature_id,
                        rank          => 0,
                        strand        => $hit->strand('hit'),
                        fmin          => $hit->start('hit') - 1,
                        fmax          => $hit->end('hit'),
                    }
                );
                $hit_row;
            };

            my $hit_row = try {
                my $row = $schema->txn_do($hit_create);
                $logger->info("created hit record $hit_value") if $verbose;
                $row;
            }
            catch {
                warn "cannot create record for hit $hit_id $_";
                $logger->warn("cannot create record for hit $hit_id $_")
                    if $verbose;
                return;
            };
            next STRAND if !$hit_row;

        HSP:
            foreach my $hsp ( @{ $hsps_on{$strand} } ) {
                my $hsp_id = uniq_hsp_id( $query_id, $hit_id, $strand, $hsp );

                # NOTE(review): both featurelocs below store the *hit*
                # strand, also the one on the query - confirm this is
                # intended.
                my $hsp_create = sub {
                    $schema->resultset('Sequence::Feature')->create(
                        {   uniquename  => $hsp_id,
                            name        => $hsp_id,
                            organism_id => $hit_row->organism_id,
                            is_analysis => 1,
                            type_id     => $match_part->cvterm_id,
                            seqlen      => $hsp->length,
                            dbxref      => {
                                accession => $hsp_id,
                                db_id     => $db->db_id
                            },
                            featureloc_feature_ids => [
                                {   'srcfeature_id' => $query_row->feature_id,
                                    'rank'          => 1,
                                    'strand'        => $hsp->strand('hit'),
                                    'fmin' => $hsp->start('query') - 1,
                                    'fmax' => $hsp->end('query'),
                                },
                                {   'srcfeature_id' =>
                                        $target_row->feature_id,
                                    'rank'   => 0,
                                    'strand' => $hsp->strand('hit'),
                                    'fmin'   => $hsp->start('subject') - 1,
                                    'fmax'   => $hsp->end('subject'),
                                },
                            ],
                            analysisfeatures => [
                                {   analysis_id  => $analysis->analysis_id,
                                    rawscore     => $hsp->bits,
                                    normscore    => $hsp->score,
                                    significance => $hsp->significance,
                                    identity     => $hsp->percent_identity,
                                }
                            ],
                            feat_relationship_subject_ids => [
                                {   type_id   => $part_of->cvterm_id,
                                    object_id => $hit_row->feature_id,
                                    rank      => $hsp->rank,
                                },
                            ],
                        }
                    );
                };

                try {
                    $schema->txn_do($hsp_create);
                    $logger->info("created hsp record $hsp_id") if $verbose;
                }
                catch {
                    warn "Unable to create hsp record for $hsp_id $_";
                };
            }
        }
    }
}

# Builds the uniquename of an HSP feature from the query id, the hit id and
# the query/subject coordinates of the HSP. $strand is accepted for symmetry
# with uniq_hit_id but is not part of the id.
sub uniq_hsp_id {
    my ( $query, $hit, $strand, $hsp ) = @_;
    return sprintf "%s:%s:%d..%d::%d..%d", $query, $hit,
        $hsp->start('query'),   $hsp->end('query'),
        $hsp->start('subject'), $hsp->end('subject');
}

# Builds the uniquename of a hit feature from the query id, the hit id, the
# strand label (plus/minus) and the query/hit coordinates of the hit.
sub uniq_hit_id {
    my ( $query_id, $hit_id, $strand, $hit ) = @_;
    return sprintf "%s:%s-%s:%d..%d:%d..%d", $query_id, $hit_id, $strand,
        $hit->start('query'), $hit->end('query'),
        $hit->start('hit'),   $hit->end('hit');
}

# Deletes the HSP features located on $query (featureloc rank 1, type
# match_part), the hit features of type $hit_type they are part_of, and the
# dbxref rows of both. Everything runs in one transaction; on failure a
# warning is issued and nothing is rethrown.
sub remove_alignments {
    my ( $schema, $query, $hit_type ) = @_;

    #get all HSPs
    my $hsp_rs
        = $query->featureloc_srcfeature_ids->search( { 'rank' => 1 } )
        ->search_related(
        'feature',
        { 'type.name' => 'match_part', is_analysis => 1 },
        { join        => 'type' }
        );

    if ( $hsp_rs->count == 0 ) {
        $logger->info( "No hsps for ", $query->uniquename ) if $verbose;
        return;
    }

    $logger->info( "updating alignment for ", $query->uniquename )
        if $verbose;

    #get all Hits
    #If the same relationship name is being used it get aliased by DBIC and which should be
    #used
    my $hit_rs = $schema->resultset('Sequence::Feature')->search(
        {   'type.name'          => $hit_type,
            'type_2.name'        => 'part_of',
            'subject.feature_id' =>
                { -in => [ map { $_->feature_id } $hsp_rs->all ] }
        },
        {   join => [
                'type',
                { 'feat_relationship_object_ids' => [ 'type', 'subject' ] }
            ]
        }
    );

    try {
        $schema->txn_do(
            sub {
                # hits first, then the HSPs
                foreach my $rs ( ( $hit_rs, $hsp_rs ) ) {
                    my $dbxref_rs = $rs->search_related('dbxref');
                    $logger->info( "Going to delete dbxref ",
                        $dbxref_rs->count, " record" )
                        if $verbose;
                    $dbxref_rs->delete_all;
                    $logger->info( 'Going to delete ', $rs->count,
                        ' record' )
                        if $verbose;
                    $rs->delete_all;
                }
            }
        );
    }
    catch {
        warn 'unable to clean alignment for query ', $query->name, " $_\n";
    };
    return;
}

# Attaches $appender, using the script's common pattern layout, to the root
# Log4perl logger, sets its level to DEBUG and returns the logger.
sub _build_logger {
    my ($appender) = @_;
    my $layout = Log::Log4perl::Layout::PatternLayout->new(
        "[%d{MM-dd-yyyy hh:mm}] %p > %F{1}:%L - %m%n");
    my $log = Log::Log4perl->get_logger();
    $appender->layout($layout);
    $log->add_appender($appender);
    $log->level($DEBUG);
    return $log;
}

# Returns a logger that writes coloured messages to STDERR.
sub setup_logger {
    return _build_logger(
        Log::Log4perl::Appender->new(
            'Log::Log4perl::Appender::ScreenColoredLevels',
            stderr => 1
        )
    );
}

# Returns a logger that writes to the file load_alignment.log.
sub setup_file_logger {
    return _build_logger(
        Log::Log4perl::Appender->new(
            'Log::Log4perl::Appender::File',
            filename => 'load_alignment.log'
        )
    );
}

=head1 NAME

B<load_alignment.pl> - [Load blast alignment in chado database]

=head1 SYNOPSIS

 perl load_alignment.pl -dsn "dbi:Oracle:host=localhost;sid=oraclesid" -u user -p pass -qorg worm blast_data.out

 perl load_alignment.pl -dsn "dbi:Oracle:host=localhost;sid=oraclesid" -u user -p pass -qorg dicty -hp dicty -qp dicty blast_data.out

 perl load_alignment.pl -dsn "dbi:Pg:host=localhost;database=mygmod" -u user
-p pass -qorg fly -hp ncbi -qp regular blast_data.out perl load_alignment.pl -dsn "dbi:Pg:host=localhost;database=mygmod" -u user -p pass -qorg fly --update blast_data.out perl load_alignment.pl -c config.yaml =head1 REQUIRED ARGUMENTS B<[-dsn|--dsn]> - dsn for the chado database, to know more about dsn string look at the documentation of L<DBI> module. B<[-u|-user]> - database user name B<[-p|-pass]> - database password B<[-qorg|-query_org]> - Name of the organism to which the query sequence belongs; it will be used to store the query record. =head1 OPTIONS B<[-h|-help]> - display this documentation. B<[-qtype|query_type]> - Sequence ontology(SO) cvterm that will be used for storing the query record. By default, it will be chosen from the type of blast search performed. The following map is being used by the program to decide that ..... =over =item blastn => nucleotide_match =item blastp or tblastn => protein_match =item blastx or tblastx => translated_protein_match =back B<[-qp|-query_parser]> - The parser that will be used to extract the query Id from the query blast header. Three parsers are available B<ncbi>, B<regular> and B<dicty>. By default, no parsing is performed. Here is the logic of the available parsers ... =over =item ncbi : The first Id that comes after gi. =item regular : It assumes there are at least 2 or more Ids present in the header separated by pipe(|) character. It returns the 2nd one. =item dicty : It is specific to header generated by dictyBase software. =back B<[-hp|-hit_parser]> - Works on blast header of the hit entry, works exactly like the query parser option. B<[-s|-src|-source]> - The source name that will be linked to every hit entry and ultimately can be used in the gbrowse configuration after the method name. It will be stored in accession of dbxref table linked via feature_dbxref table. By default, B<dictyBase_blast> is used.
B<[-so|seq_onto]> - Sequence ontology namespace under which SO is loaded, default is B<sequence> B<[-update]> - Updates the alignments, here the loader tries to find the query in the database after parsing the header. If found, it deletes all Hit and HSPs that are linked to each of them. Then it creates new entries as it happens in case of a run without update flag. B<[-db_src|db_source]> - Name of the database authority to which every entry will be tied in this blast loading. By default B<GFF_source> will be used. B<[-c|-config]> - Configuration in YAML format from where options will be read. Here is an example of it ..... B<[-v|-verbose]> - Show more output of every step, by default it goes to STDERR =over #The main datasource where the blast alignment will be stored database: dsn: "dbi:Oracle:host=yada;sid=yada" user: yada password: yada #The database from where the hit id will be looked up #Absolutely dictybase specific meta: dsn: "dbi:Oracle:host=yada;sid=yada" user: yada password: yada #Where the gene product name is stored #Absolutely dictybase specific legacy: dsn: "dbi:Oracle:host=yada;sid=yada" user: yada password: yada query: organism: dicty parser: dicty target: parser: none organism: dpur so: sequence source: dictyBase_blast database_source : GFF_source match_type: protein_match =back =head1 DESCRIPTION The blast data is loaded following the best practices of GMOD community, particularly making it compatible with bulk GFF3 loader script. The following storage model is followed ... ------------------------------------------ genome ^ ^ ^ ^ | ___|____A_|___ | alignment feature type = match floc | ^ ^ | floc (rank = 0) | | f_r f_r | | --B----- ----C--- hsp feature type = match_part | | floc | | floc (rank = 1) V V ----D----- aligned feature type(protein/DNA/EST) =over =item * The query gets a feature record, its id gets parsed from its blast header. =item * Each hit gets a feature record along with its entry in analysisfeature table.
In addition, the hit also adds a featureloc record tied to the genome. This is done to make it gbrowse compatible as the gbrowse-chado adaptor needs a featureloc entry for displaying. =item * The hsps get a feature entry along with two featureloc entries both to genome and query feature and another feature relationship entry with its corresponding hit. =back =head2 OTHER FEATURES OF THE SCRIPT =over =item * No attempt is made to store the sequence of query. =item * The hit id is parsed from its header and then used to look up for reference feature(genome) in chado. If absent, that particular alignment is skipped. So, it is necessary to use id of reference feature in the fasta file of target sequence. =item * As per chado database constraint, the unique id of each hit is constructed by combining both query and hit id. In gbrowse callback, the query id is parsed for display. =back =head1 DIAGNOSTICS Each insertion is done in its separate transaction. In case of failure, the script warns and moves on to the next alignment. =head1 CONFIGURATION AND ENVIRONMENT None. =head1 DEPENDENCIES Bio::Chado::Schema Bio::SearchIO Try::Tiny =head2 Optional dependencies[Depending on database server] DBD::mysql DBD::Pg DBD::Oracle =head1 BUGS AND LIMITATIONS It does not store any sequences. The HSP alignments are also not stored. =head1 AUTHOR I<Siddhartha Basu> B<siddhartha-basu@northwestern.edu> =head1 LICENCE AND COPYRIGHT Copyright (c) B<2009>, Siddhartha Basu C<<siddhartha-basu@northwestern.edu>>. All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See L<perlartistic>.
Private
[
?
]
Run code