Monday 27 June 2011

Deparse an ABI file...

#!/usr/bin/perl -w

use strict;

my $stadenApp = "~/staden-linux-1-6-0/linux-bin/getABIfield";


for my $inputFile (@ARGV){
unless($inputFile =~ /\/(KN654-BAC(\d\d)-(\d\d)_([A-H]\d\d)-(FP|RP))\.ab[1i]$/){
warn "ignoring $inputFile\n";
next;
}

my $fileName = $1;
my $wellId = $4;
open APP, "-|", "$stadenApp -a -t $inputFile"
or die "failed to exec $stadenApp : $!\n";
my %data;
while(){
## Skip some 'data rows' that we can ignore for now.
next unless
/^$inputFile (\S{4}) (\d+) (.*)$/;
chomp;
$data{"$1 $2"} .= $3;
}
if (keys %data == 0){
die "wha? \n"
}
print
join("\t",
$fileName,
$data{"SMPL 1"}, # Trace name (unique id)
$data{"RunN 1"}, # Run name
$wellId,
$data{"TUBE 1"}, # Well id
$data{"GTyp 1"}, # Polymer name
$data{"MCHN 1"}, # Instrument name
$data{"MODL 1"}, # Instrument model
$data{"PDMF 1"}, # Mobility file
$data{"CMNT 1"}, # Comment
## Signal to noise ratio for the 4 dyes
split(/ /, $data{"S/N% 1"}),
## Times of the various 'events'
$data{"RUND 1"}. " ". $data{"RUNT 1"},
$data{"RUND 2"}. " ". $data{"RUNT 2"},
$data{"RUND 3"}. " ". $data{"RUNT 3"},
$data{"RUND 4"}. " ". $data{"RUNT 4"},
## More...
$data{"APrV 1"}, # Analysis protocol version number
$data{"CTNM 1"}, # Plate Name
$data{"CTID 1"}, # Plate barcode
$data{"ASPt 1"}, # Basecall start scan
$data{"AEPt 1"}, # Basecall stop scan
$data{"SPAC 1"}, # Base spaceing
), "\n";
}

here it is wrapped in Perl

#!/usr/bin/perl -w

use strict;

## Set up the Perl modules we need

## Read the command line options
use Getopt::Long;

## Read in the fasta sequence file
use Bio::SeqIO;

## Run the blast query
use LWP::UserAgent;
#use HTTP::Cookies;

## Parse the results
use HTML::Strip;
use Bio::SearchIO;






## Set up the net stuff

## Set up a 'web user agent' object (something like a browser).

my $ua = LWP::UserAgent->new;



## Configure the user agent:

## Response time out (in seconds)
$ua->timeout(10);

## How the user agent should handle cookies:
#$ua->cookie_jar({ file => "lwpcookies.txt", autosave => 1})



## Note the following URI is the form behind the login page:
## http://yh.genomics.org.cn/potato/login.jsp

my $loginUri = 'http://yh.genomics.org.cn/potato/check.jsp';



## Use the user agent to login to the webserver

my $login = $ua->
post( $loginUri, [ username => 'test', password => 'abc123' ]);

#warn $login->as_string, "\n";
#exit;



## Don't know why the above dosn't trigger writing a cookie to the
## cookie.jar! (So cookie code is commented out.)

## Grab the cookie manually

die "CANT FIND MY COOKIES!!!!\n"
unless $login->as_string =~ /^Set-Cookie: JSESSIONID=(\w*);/sm;

my $JSESSIONID = $1;



## Manuall set the cookie in the user agents header, so it will be
## used for all subsequent communication.

$ua->default_header('Set-Cookie' => $JSESSIONID);



## The following URI is the form behind the blast page:
## http://yh.genomics.org.cn/potato/search.jsp

my $blastUri = 'http://yh.genomics.org.cn:8099/blast.cgi';





## Now we are set to loop through the sequences in the fasta sequence
## file and blast each one!

## Process the command line

my $fastaFile;

GetOptions(
"fasta=s" => \$fastaFile,
)
or die "failed to parse command line options!\n";

die "please pass a fasta file to blast\n"
unless $fastaFile && -e $fastaFile;



## Process the fasta file

my $s = Bio::SeqIO->
new( -file => $fastaFile,
-format => 'fasta'
);

while(my $seq = $s->next_seq){
## Lets just dump the results!
my $outFile = 'Scratch/'. $seq->id. '.blast.html';
if(-s $outFile ){
warn "skipping, file exists : $outFile\n";
next;
}
warn "doing ". $seq->id. " (". $seq->length. ")\n";
#warn $seq->seq, "\n";
open(OUT, '>', $outFile)
or die "cant open $outFile\n";
## Blast it!
## Set any blast options you want here!
my $blast = $ua->
post( $blastUri, [ DATALIB => 'potato',
SEQUENCE => $seq->seq ]);
#warn $blast->as_string, "\n";
#exit;
print OUT $blast->as_string;
}




__END__

## below we already moved onto processing the results, which is too much for one script.
## Get the results (scalar) as a file handle for processing with
## SearchIO. Here we also strip out the HTML from the results.
my $string = HTML::Strip->new->parse( $blast->as_string );
open my $fh, '<', \$string
or die "FUCK!\n";

my $in =
Bio::SearchIO->new( -format => "blast", -fh => $fh );
while( my $result = $in->next_result ) {
while( my $hit = $result->next_hit ) {
while( my $hsp = $hit->next_hsp ) {

## Apply some basic filtering
if( $hsp->length('total') > 50 ) {
if ( $hsp->percent_identity >= 75 ) {
print
join("\t",
$hit->name(),
$hsp->percent_identity,
$hsp->length('total'),
# No. mismatches?
$hsp->percent_identity('total'),
# No. gaps
$hsp->gaps('total'),
$hsp->start('query'),
$hsp->end('query'),
$hsp->start('hit'),
$hsp->end('hit'),
$hsp->evalue(),
$hsp->score(),
), "\n";
}
}
}
}
}
last;
}

warn "OK\n";