#!/usr/bin/perl

=head1 NAME 

blast_result_HSP_rep_check.pl  

=head1 SYNOPSIS

write this...
Make sure that you include a "id" auto-incremented primary key to identify each HSP row.

For Instance, with the SQL: 

alter table blast_res_hvgissr_bacpacpseudo
 add column  HSP_id int unsigned not null auto_increment after bit_score
,  add primary key (HSP_id)
, type=MyISAM



Include also a new field (column) to hold Repeated_HSP info:  

alter table blast_res_hvgissr_bacpacpseudo 
 add column Repeated_HSP tinyint(1) unsigned not null default '0' after bit_score
, type=MyISAM


These need to be updated in the database prior to running this script


=head1 DESCRIPTION

write this 

=head1 ARGUMENTS

-host
-user
-p[ass]
-dbname
-help

=cut


use strict;
use DBI;
use Getopt::Long;


my $host   = '0.0.0.0';  # Set the IP address of your mysql server here.
my $dbname = 'est_ssr';   # The database with the blast_results_table to check.
my $tab_name = 'blast_res_osgi_chr1pseudo';  # The table to check.
my $dbuser = undef;
my $dbpass = undef;
#my $dbuser = 'perlscript';
#my $dbpass = 'perlscript';
my $sth = undef;
my $help = undef;


# The columns to keep from the SQL query

my %table_columns=qw(
Qacc				1
Sacc				1
);

my %table2_columns=qw(
Qacc				1
Sacc				1
Qstart				1
Qend				1
Sstart				1
Send				1
E_val				1
bit_score			1
Repeated_HSP		1
HSP_id				1
);


my $best_HSP_ref=undef;


&GetOptions( 'host=s'    => \$host,
	    'dbname=s'  => \$dbname,
	    'user=s' => \$dbuser,
	    'p|pass=s' => \$dbpass,
	    'h|help+' => \$help,
);
if(  defined $help  ) {
    system("perldoc $0");
    exit(0);
}if (!defined $dbuser) {
	warn "Please provide a username with access to database (such as guest)\n\n";
	system("perldoc $0");
    exit(0);
}



my %mockup_table_results=(Qacc => "TC96141", Sacc => "chr1_ctg1");
#my %mockup_table_results=(Qacc => "AL820063", Sacc => "AL606692");

print STDOUT  "\n  Will be updating the table $dbname.$tab_name \n\n";


#### need to connect to database and retrieve HSPs info 
#### for each query::subject pair

my $dsn = "DBI:mysql:database=$dbname;host=$host";
my $dbh = DBI->connect($dsn,$dbuser,$dbpass,{RaiseError=>1}) or die "Could not connect, error was:\n\t$DBI::errstr\n\n";

# This Retrieves BLAST list of Query::Subject pairs info from db
my $QuerySubj_Pairs_array_ref = &QueryDB;  

# OR use mockup for test...

# The array ref is equal to and anonymous array  '[]' that contains in its first element the ref to a hash
#my $QuerySubj_Pairs_array_ref = [\%mockup_table_results];  

my $HSPs_array_ref= undef;

# Prepare the statement to retrieve the HSPs for current Query::Subj pair
my $getTheseHSPs = qq[
SELECT Qacc, Sacc, Qstart, Qend, Sstart, Send, E_val, bit_score, Repeated_HSP, HSP_id
FROM   $dbname.$tab_name
WHERE  Qacc = ?
  AND  Sacc = ?
ORDER BY Qstart, Qend
#limit 5
];

my $sth_getHSPs = $dbh->prepare($getTheseHSPs);



# Prepare the query statement to update database, before getting into loop
my $updateSQLstr = qq[
UPDATE $dbname.$tab_name SET Repeated_HSP=1
WHERE HSP_id=?
];

my $sth_updateBLASTtable = $dbh->prepare($updateSQLstr);


# print "The contents of the retrieved table are: \n";

my $records =1;
my $previous_HSPref=undef;
my $flag_as_bad = undef;

# BIG LOOP
foreach my $QuerySubjPair (@{$QuerySubj_Pairs_array_ref}) {
	print STDERR "\n================\n==  New Query :: Subject pair to compare.\n================\n";

  # Obtain the HSPs
	eval{
	$sth_getHSPs->execute($QuerySubjPair->{Qacc} , $QuerySubjPair->{Sacc});
	};  die "\n\tBIG Problem!,  can't execute query. error is:\n $@ \n"  if ($@);

	$HSPs_array_ref = $sth_getHSPs->fetchall_arrayref(\%table2_columns);

	$best_HSP_ref = $previous_HSPref = ${$HSPs_array_ref}[0];  # store the first one for now
	
	print STDERR "\tQacc    \tSacc\tQstart\tQend\tbit_score\tHSP_id\tRepeated_HSP\n"; 

  # while in the same QUERY::SUBJ pairs
	foreach my $HSProw (@$HSPs_array_ref) { # for each HSP record in QuerySubj Pairs table
		print STDERR "\t$HSProw->{Qacc}\t$HSProw->{Sacc}\t$HSProw->{Qstart}\t$HSProw->{Qend}\t$HSProw->{bit_score}\t$HSProw->{HSP_id}\n";  #if object is returned as array of hashes

		# Test for the adjacency of this HSP to previous (records are expected to be sorted by Qstart,  Implement this in the SQL above  )
		#  If overlapped,  Is this one better than previous?

		if (&OverlapsPrevious($best_HSP_ref,$HSProw)) {  # table required to be sorted left to right for this to work
			#print STDERR "Overlap found with best HSP so far based on score...  \n";
			
			# Compare statisitics for this HSP with best so far, 
			# set flag to 1  if HSP overlpas another one that has better bitscore
			($best_HSP_ref,$flag_as_bad) = &StoreBestHSP($best_HSP_ref,$HSProw);

			&UpdateTable($flag_as_bad);

			$previous_HSPref = $HSProw;

		}
		else {   #  IF not overlap to previous(the HSP to the left of this one), set this one as best score so far
			$previous_HSPref = $best_HSP_ref = $HSProw;
			# print STDERR "   New BEST non-overlapping HSP : $best_HSP_ref->{HSP_id} . \n ";
		}
		$records++;
	}


} # END BIG LOOP

# Finish statements
$sth_updateBLASTtable->finish;
$sth_getHSPs ->finish;



### DISCONNECT FROM DB
$dbh->disconnect or warn "Disconnection from DB failed at end of script: $DBI::errstr\n";

$records=$records-1;

print "\nThe number of HSPs processed was $records.    bye\n";


###################### END OF MAIN ################






#####
#####

sub QueryDB {

	# Design Query
	my $statement=qq[
	SELECT Qacc, Sacc
	FROM $dbname.$tab_name
	GROUP by Qacc, Sacc
	ORDER by HSP_id
	#LIMIT 5
	];

	# print STDERR "First SQL Statement is:\n$statement\n";


	$sth = $dbh->prepare($statement);

	# Execute query.
	eval{
	$sth->execute(); 
	};
		die "\n\tBIG Problem!,  can't execute query. error is:\n $@ \n"  if ($@);



	my $array_ref= $sth->fetchall_arrayref(\%table_columns); # only the selected colums, as array of hashes 
	# if left empty, gets all columns
	# my $array_ref= $sth->fetchall_arrayref()

	# OR select colums by number, returning ref to array of arrays ...
	#my $array_ref= $sth->fetchall_arrayref( [0,1,4,2,3,9] );


	my $table1size= @$array_ref;
	print STDERR "The number of query|subject pairs in result set is $table1size\n\n";


	$sth->finish;
	return $array_ref;
}
####
###### end sub
####







#####
#####

sub OverlapsPrevious {  # Based on MSPcrunch2 algorithms

	my ($PreviousHSP,$ThisHSP) = @_;

	if ($ThisHSP->{HSP_id} == $PreviousHSP->{HSP_id}) {
		#print STDERR "Warning, If you want met to compare the overlap of one HSP with itself...I say there is no overlap. \n";
		return 0;
	}


	####_____________________________________	#   SS1  SE1    SS2   SE2
	my $ThisQstart    = $ThisHSP->{Qstart};		#  --=====------=======---
	my $PreviousQend  = $PreviousHSP->{Qend};	#    |||||     ///////   QueryGap = QS2-QE1-1
	my $ThisSstart    = $ThisHSP->{Sstart};		#  --=====----=======---
	my $PreviousSend  = $PreviousHSP->{Send};	#	QS1  QE1 QS2   QE2 

	# correction to returned subject coordinates:
	# (if in other strand orientation (-),   coordinates are backwards)
	#  To fix,  use the absolute value of difference


	my $QueryGap = ( $ThisQstart - $PreviousQend -1);  
	my $SbjGap   = (abs($ThisSstart - $PreviousSend) -1); 

	# return the greater of two general scalars : ($a,$b)[$a<$b]

	# We want the minimum, so invert the sign
	my  $HSP_dist = (($QueryGap,$SbjGap)[$QueryGap>$SbjGap]);
	#print STDERR "HSPdist is $HSP_dist  ,  QueryGap is $QueryGap and SubjGap is $SbjGap  \n";

	#  IF $HSPdist is negative (but we accept up to 20 bases overlap),  the two HSPs are overlapping,
	#  This obviously means than one of the two (Query or SUbject of the two HSPs) has a repeated region
	#  for which we are detecting multiple hits.

	if ( ($HSP_dist < -20) ) {
		# Return TRUE
		print STDERR "YES, is in overlap (HSPdist = $HSP_dist)...     ";
		return 1;
	} else {
		# Return FALSE
		print STDERR "NOT in overlap\n";
		return 0;
	}


}


#####
#######  end sub
#####









#####
#####

sub StoreBestHSP {

# called like:  ($best_HSP_ref,$flag_as_bad)= &StoreBestHSP($best_HSP_ref,$HSProw);

# returns the High score and the low score HSPs

#	print "\n$HSProw->{Qacc}\t$HSProw->{Sacc}\t$HSProw->{bit_score}\t$HSProw->{HSP_id}\n";  #if object is returned as array of hashes

	my $bestHSPref         = shift;
	my $currentHSPref      = shift;

	#my $bestHSP_QSpair     =  $bestHSPref->{Qacc}."_".$bestHSPref->{Sacc};
	#my $currentHSP_QSpair  = $currentHSPref->{Qacc}."_".$currentHSPref->{Sacc};

	if($currentHSPref->{bit_score} > $bestHSPref->{bit_score}) {
		#print STDERR  "NEW BEST HSP, this HSP overlaps previous best, but has better score. \n";
		return ($currentHSPref,$bestHSPref);
	}
	else {
		#print STDERR  "this HSP not as good as previous one.\n";
		return ($bestHSPref,$currentHSPref);   #  else,  the older is still the best
	}

}
###### end sub
####





#####
#####
sub UpdateTable {

my $targetHSP = shift;
my $HSPnum = $targetHSP->{HSP_id};

# called as UpdateTable$flag_as_bad);
	print  "Updating table for HSP $HSPnum.\n";


eval{
	$sth_updateBLASTtable->execute($HSPnum);
#	print STDOUT "Please uncomment lines in UpdateTable function if you want to save to DB\n";
}; 	die "\n\tBIG Problem!,  can't execute query. error is:\n $@ \n"  if ($@);


return;
}
####
#######  END sub
####


