#!/usr/bin/perl # Copyright (c) 1999-2000 Tugba Suzek and Steven L. Salzberg # Improvements and bug fixes due to Melanie Duffield and Carl Mayers, Aug 2001. # See the README file for instructions. # blastfasta.pl -f -b # type -f to include FASTA search # type -b to omit BLAST search # if you'd like comparative information about the running times # of blast and fasta, delete the comment character from the # lines containing "times" # $DBASE needs to contain the location of the amino acid database $DBASE="/usr/local/db/nr"; # Please set the directory names below to the directories # where your FASTA and BLAST reside $FASTA_DIRECTORY="/usr/local/bin/fasta"; $BLAST_DIRECTORY="/usr/local/bin/blast"; # can be any write permissible directory, default is the current directory $TEMP_FILE="/tmp/temp"; open SEQ,$ARGV[0]; open GLIMMER_O,$ARGV[1]; open OUT ,"> ".$ARGV[2]; $KEY=$ARGV[3]; # How many of the top hits do you want to be listed ? if ($KEY=~ /[^0-9]+/) { die "usage: blastfasta.pl -fb\n"; } #codons and corresponding amino acids $codon2aa{"ttt"}= "F"; $codon2aa{"ttc"}= "F"; $codon2aa{"tta"}="L"; $codon2aa{"ttg"}= "L"; $codon2aa{"tct"}= "S"; $codon2aa{"tcc"}= "S"; $codon2aa{"tca"}= "S"; $codon2aa{"tcg"}= "S"; $codon2aa{"tat"}= "Y"; $codon2aa{"tac"}= "Y"; $codon2aa{"tgt"}= "C"; $codon2aa{"tgc"}= "C"; $codon2aa{"ctt"}= "L"; $codon2aa{"ctc"}= "L"; $codon2aa{"cta"}= "L"; $codon2aa{"ctg"}= "L"; $codon2aa{"cct"}= "P"; $codon2aa{"ccc"}= "P"; $codon2aa{"cca"}= "P"; $codon2aa{"ccg"}= "P"; $codon2aa{"cat"}= "H"; $codon2aa{"cac"}= "H"; $codon2aa{"caa"}= "Q"; $codon2aa{"cag"}= "Q"; $codon2aa{"cgt"}= "R"; $codon2aa{"cgc"}= "R"; $codon2aa{"cga"}= "R"; $codon2aa{"cgg"}= "R"; $codon2aa{"att"}= "I"; $codon2aa{"atc"}= "I"; $codon2aa{"ata"}= "I"; $codon2aa{"atg"}= "M"; $codon2aa{"act"}= "T"; $codon2aa{"acc"}= "T"; $codon2aa{"aca"}= "T"; $codon2aa{"acg"}= "T"; $codon2aa{"aat"}= "N"; $codon2aa{"aac"}= "N"; $codon2aa{"aaa"}= "K"; $codon2aa{"aag"}= "K"; $codon2aa{"agt"}= "S"; $codon2aa{"agc"}= "S"; $codon2aa{"aga"}= "R"; $codon2aa{"agg"}= "R"; $codon2aa{"gtt"}= "V"; $codon2aa{"gtc"}= "V"; $codon2aa{"gta"}= "V"; $codon2aa{"gtg"}= "V"; $codon2aa{"gct"}= "A"; $codon2aa{"gcc"}= "A"; $codon2aa{"gca"}= "A"; $codon2aa{"gcg"}= "A"; $codon2aa{"gat"}= "D"; $codon2aa{"gac"}= "D"; $codon2aa{"gaa"}= "E"; $codon2aa{"gag"}= "E"; $codon2aa{"ggt"}= "G"; $codon2aa{"ggc"}= "G"; $codon2aa{"gga"}= "G"; $codon2aa{"ggg"}= "G"; $codon2aa{"tgg"}= "W"; $codon2aa{"tag"}= "*"; $codon2aa{"tga"}= "*"; $codon2aa{"taa"}= "*"; #reverse complements $comprev{"g"}= "c"; $comprev{"c"}= "g"; $comprev{"a"}= "t"; $comprev{"t"}= "a"; system "rm -rf times"; $callfasta=0; $callblast=1; if (@ARGV < 4) { die "usage: blastfasta.pl -fb\n"; } for($i=0;$i<=$#ARGV;$i++){ $option=$ARGV[$i]; if ($option=~ /\-f/){$callfasta=1;} elsif ($option=~ /\-b/){$callblast=0;} } if(!$callfasta && !$callblast){ print "Exit condition: Either fasta or blast should be selected\n"; exit; } $whole_seq=""; while(){ chomp; next if ($_=~ /[^agctAGCT]/); $whole_seq.=lc($_); } while(){ chomp; $line=$_; @temp=split(/\s+/,$line); if ($temp[2]>$temp[3]) { $start=$temp[3]; $end=$temp[2]; $mels=&complement_strand(substr($whole_seq,$start,$end-$start)); $cutlet=&seq2protein($mels); } if ($temp[3]>$temp[2]) { $start=$temp[2]; $end=$temp[3]; $cutlet=&seq2protein(substr($whole_seq,$start-1,$end-$start+1)); } $my_str="\n$line\n"; if ($callblast){$my_str.=("BLAST\n").(&call_blast($cutlet));} if ($callfasta){$my_str.=("FASTA\n").((&call_fasta($cutlet)));} print OUT $my_str,"\n"; } close(OUT); system "rm -rf $TEMP_FILE"; #main program ends here ############################################################### # SUB CALL_FASTA ############################################################### sub call_fasta { my $str1=shift; my $key=$KEY+1; open D ,"> $BLAST_DIRECTORY/db/seq"; print D &seq2fasta($str1); close D ; my $key=$KEY+1; system "$FASTA_DIRECTORY/fasta -Q -H -b $key -d $key $BLAST_DIRECTORY/db/seq $DBASE > $TEMP_FILE"; # system "grep \"CPU time\" $TEMP_FILE >> times"; open D ,"$TEMP_FILE"; while(){ if ($_=~ /^>>/){ my $line1=$_; while(){ $line1.=$_; } return $line1; } } } ############################################################### # SUB CALL_BLAST File E contains a list of all the protein sequences # in FASTA format ############################################################### sub call_blast { my $str1=shift; open D ,"> $BLAST_DIRECTORY/db/seq"; # open E ,">> $BLAST_DIRECTORY/db/fasta"; print D &seq2fasta($str1); # print E &seq2fasta($str1); close D ; # close E ; my $key=$KEY+1; system "$BLAST_DIRECTORY/blastp $DBASE $BLAST_DIRECTORY/db/seq -hspmax $key -warnings -errors > $TEMP_FILE"; # system "grep \"cpu time\" $TEMP_FILE >> times"; open D ,"$TEMP_FILE"; $count=0; while(){ if ($_=~ /^>/){ my $line1=$_; while(){ if ($_=~ /^>/){ $count++; if ($count > $key-1){ return $line1; } } $line1.=$_; } } } } ############################################################### # SUB SEQ@PROTEIN # converts the plain nucleotide sequence to amino acid sequence ############################################################### sub seq2protein { my $seq=shift @_; my @temp_array={}; @temp_array=split(//,$seq); my $aa_seq=""; for ($i=0; $i<$#temp_array; $i+=3){ my $codon=$temp_array[$i].$temp_array[$i+1].$temp_array[$i+2]; if (defined $codon2aa{$codon}){ $aa_seq.=$codon2aa{$codon}; } #undecidable nucleic acid else{ $aa_seq.="X"; } } return $aa_seq; } ############################################################### # SUB SEQ2FASTA # converts the plain sequence to FASTA format ############################################################### sub seq2fasta{ $seq=shift; $ret_seq=""; $ret_seq.='>'." $temp_array[0]\n"; for($i=0;$i