#! /export/shared_apps/perl/bin/perl
use Bio::SeqIO;
use Bio::SearchIO;
use Bio::DB::GenPept;
use Bio::Structure::IO;
use Bio::SeqUtils;
use Bio::Seq;
use Bio::Tools::pSW;
use Bio::AlignIO;
use DBI;
use Class::Struct;
use CGI;
# Struct for creating the alignment map of the true physical positions, since the BLAST alignment does not
# conserve the given amino acid positions
struct Map => {
ncbiid => '$',
pdbid => '$',
pdbchain => '$',
ncbipos => '@',
pdbpos => '@',
};
# Hash for matching amino acids, and its reverse hash
my %aa1= qw (ALA A ARG R ASN N ASP D CYS C GLN Q GLU E GLY G HIS H ILE I LEU L LYS K MET M PHE F PRO P SER S THR T TRP W TYR Y VAL V);
my %aa2=reverse (%aa1);
# function to perform the actual BLAST alignments to find matches for the protein sequences taken from MutDB
sub parseBlastFile {
my $blastoutfname = $_[0];
my $pos = $_[1];
# Open blast output file
my $blastres = new Bio::SearchIO(-format => 'blast',
-file => $blastoutfname);
# Now get identical hits store as an array containing a list of alternating names and $hsp data
# @hits = ( $name1, $hsp1, $name2, $hsp2, .... ) $hspi == a SearchIO hsp object
# blast hits
my @hits = ();
# cycle through all of the results
while ( my $res = $blastres->next_result ) {
# cycle through all of the hits within the result
while( my $hit = $res->next_hit ) {
# cycle through each hsp object
while ( my $hsp = $hit->next_hsp ) {
# if the identity percentage between the query sequence and the resulting match is 100%, continue
if ($hsp->frac_identical==1.0){
# if the amino acid position of the mutation is within the sequence that matches between
# the original query and the result, continue
if($hsp->start('query') <= $pos && $hsp->end('query') >= $pos) {
print $hit->name, " ";
print $hsp->frac_identical(), " ";
print OUTFILE $hit->name, " ";
print OUTFILE $hsp->frac_identical(), " ";
print OUTFILE "\n";
print "\n";
# if the total length of the matched sequence section is more than 15 amino acids, keep it
# all of these special cases eliminates short strings of repeated characters and other such
# matches
if($hsp->length('total') > 15){
push( @hits, $hit->name, $hsp );
}
}
}
}
}
}
# return all of the results found, if any
return @hits;
}
###################################################################################################################################################################################
# end OF SUB routine and beginning of main block #
###################################################################################################################################################################################
# now we start the main bulk of the script
# open the output logfile
$error = open(OUTFILE, ">blastStatus.txt") or die "Can't create smdbsnplog.txt: $!";
my $blastfile = $ARGV[0];
my $pos = $ARGV[1];
@hits = parseBlastFile( $blastfile, $pos);
# get the number of results
my $size = scalar @hits;
print "\nblast results: ", $size, "\n";
print OUTFILE "Sequence: ", $seq_str, "\n";
print OUTFILE "blast results: ", $size, "\n";
# if we got any blast results, continue
if ($size!=0) {
# the total number of pdb records kept
my $pdbcnt = 0;
# cycle through the blast results
for (my $x=0; $x<@hits; $x+=2) {
# keep the hit
my $hsphit = $hits[$x+1];
# save the middle two letters of the PDB structure identifier
#my $pdbslice = substr($hits[$x], 1, 2);
my @hits_substr = split('|',$hits[$x]);
print "\n K1000 Complete hit variable: $hits[$x]";
my $pdbslice = lc($hits_substr[5].$hits_substr[6]);
print "\nK1001 Save the middle two letters of PDB structure identifier: $pdbslice";
# save the entire PDB structure identifier
my $pdbid = lc($hits_substr[4].$hits_substr[5].$hits_substr[6].$hits_substr[7]);
print "\nK1002 Save the entire PDB structure identifier: $pdbid";
}
}