#!/usr/bin/perl

=head1 NAME 

ProcessBACoverlap_SSR.pl  

=head1 SYNOPSIS

ProcessBACoverlap_SSR.pl [-h[elp]] -R_chr {1..12} [-user {username}] [-p[ass] {PASSWD}] 

=head1 DESCRIPTION

Finds whether each set of SSR coordinates from DB is inside the overlap between two neighbor BACPACs and updates the database with such info.
It is meant to be run only when there is new data and the database needs updating.

=head1 ARGUMENTS


=cut


use strict;
use DBI;
use Getopt::Long;
#use Time::Local;

my $host   = '0.0.0.0';
my $dbname = 'EST_SSR';
my $dbuser = 'perlscript';
my $dbpass = 'perlscript';
my $help = undef;
my $rice_chr=undef;
my ($sth1,$sth2,$sth3);

# Tables involved: 
#  EST_SSR.rgp_gSSR,  RGP_assembly.bacpac_curated_ord, RGP_assembly.mblast_res_bacpac_vs_bacpac


# procedure: query SSR tables, ordered by rice chr and BACPAC_order, 
# save int data struct "main table"
#
# Create a second data struct that will hold overlaps to wanted records 
# from to the previous main table
#
# For each of the SSRs in the main table, find if its located in overlaps with previous BACPAC
# Save results back to database.
#
#NOTE : Accept all from chr 1 since they are already contiged.


#  data structure (those not mentioned will be discarded from the retrieved SQL result set)
my %table_columns=qw(
IRGP_gSSRid        1
BACPACacc          1
SeqID              1
SSRtype            1
motif              1
start              1
end                1
len                1
score              1
ordered_position   1
);

&GetOptions( 'host=s'    => \$host,
	    'dbname=s'  => \$dbname,
	    'user=s' => \$dbuser,
	    'p|pass=s' => \$dbpass,
		'R_chr=i' => \$rice_chr,
	    'h|help+' => \$help,
);
if(  defined $help  ) {
    system("perldoc $0");
    exit(0);
}
if( (defined $rice_chr) && (0 < $rice_chr) && ($rice_chr < 13)) {
	print STDERR "\n\tTable will extract info from rice chromosome $rice_chr only\n\n\n";
	
}else {
	warn "Warning,  '-R_chr  <?>' is required.  The argument for this tag is the rice chromosome number (1..12).\n\n";
	system("perldoc $0");
    exit(0);
}


## CONNECT TO DATABASE
# Hardcoded to use the mysql driver, change to suit needs...
my $dsn = "DBI:mysql:database=$dbname;host=$host";
my $dbh1 = DBI->connect($dsn,$dbuser,$dbpass,{RaiseError=>1}) or die "Could not connect, error was:\n\t$DBI::errstr\n\n";


## EXECUTE QUERY, RETRIEVE DATA (for table  1)
my $result_set_array_ref = undef;
print STDERR "Retrieve the whole set of SSRs from clones on this chromosome.\n Please wait (doing DB query)...\n";
$result_set_array_ref=&DoStandardQuery; # and arrayref of hashes if using %table_columns hash.
my $table1size= @$result_set_array_ref;


# get overlap details into table 2: a ref to a hash containing overlaps. Use BACPACacc as Key
print STDERR "Obtaining the list of ordered BACPACs in the tiling path...\n";
my $table_working_set_ref = &LabelOverlap_extras;  #



## WORK WITH DATA, comparing BPSSR coordinates with overlap coordinates
# write the coordinates and FLAG redundant SSRs inside overlap coordinates
#DB TABLE "Overlap_redundant" has fields IRGP_gSSRid, SeqID, InTilingPath , InOverlap

my $sth_main = $dbh1->prepare("INSERT INTO redundancy_check(IRGP_gSSRid, SeqID, InTilingPath , InOverlap) VALUES (?,?,?,?)");

# $inTilingPath should be set to 1 if retrieved by our query.  BACS not in tyling path should be avoided in future queries (Use SQL  "SELECT ... FROM redundancy_check WHERE InTilingPath == 1 AND InOverlap == 0" )
# Modify $inOverlap depending of table comparison
my ($InTilingPath , $InOverlap, $k) = ( 1 , 0 , 0); 

foreach my $line (@$result_set_array_ref) { # for each check gSSRid record in table 1 
		# Do the comparison,  SET $InOverlap=1 if this gSSR inside overlap region with previous bacpac
	$InOverlap = &CompareTables($line->{BACPACacc},$line->{start},$line->{end});
	$k++ if ($InOverlap==1) ;
	# last if ($k>9); # DEBUG:  does internal limit to 10 redundant (plus many non redundant) records for testing, delete after...
	# Write results back to SQL database

#	eval{
#		$sth_main->execute( $line->{IRGP_gSSRid}, $line->{SeqID}, $InTilingPath , $InOverlap );
#	}; 
#	die "\n\tBIG Problem!,  can't insert records into table. error is:\n $@ \n"  if ($@);

	$InOverlap=0;  # RESET
}
print STDERR "\n\nThere were $table1size gSSR records. Of these, $k were redundant (In overlaps). \n";

$sth_main->finish;

### DISCONNECT FROM DB
$dbh1->disconnect or warn "Disconnection from DB failed at end of script: $DBI::errstr\n";
print STDERR "\nDisconnnected from database...\n\n\n";


### PRINT THE TABLE
#&print_table;



#
##
####
###### END OF MAIN, SUBS FOLLOW
###
##
#

############################
sub CompareTables{# returns TRUE (1) if this gSSR coordinate is inside overlap with previous BAC

my $thisBACPACacc = shift;
my $SSRstart = shift;
my $SSRend = shift;

#print STDERR " About to compare tables with these values:  $thisBACPACacc, $SSRstart, $SSRend \n";

# If working with the megablast_summary table query, all overlap info is in single row ([0][0] to [0][4])
my $prevBACPACacc		= ${%$table_working_set_ref}{$thisBACPACacc}->[0][0];
if (!defined($prevBACPACacc)){
#	print STDERR "Response from function CompareTables(): This BACPAC has no overlap with previous BACPAC.\n";
	return 0 ;
}

my $prevBACleftcoord	= ${%$table_working_set_ref}{$thisBACPACacc}->[0][1];  
my $prevBACrightcoord	= ${%$table_working_set_ref}{$thisBACPACacc}->[0][2];
my $thisBACleftcoord	= ${%$table_working_set_ref}{$thisBACPACacc}->[0][3];
my $thisBACrightcoord	= ${%$table_working_set_ref}{$thisBACPACacc}->[0][4];

#print STDERR "Response from function CompareTables(): $thisBACPACacc overlap coord = [$thisBACleftcoord , $thisBACrightcoord] .\n";

# are the coordinates of the argument (The gSSR) within THIS_BACPAC/prev_BACPAC overlap coordinates?
#  NOTE:  megablast Subj_leftcoord always < Subj_rightcoord is ONLY guaranteed in megablast_summary table
#   otherwise need to check this assumption before comparing next..
if ( ($thisBACleftcoord <= $SSRstart) && ($SSRend <= $thisBACrightcoord) ) {
	# Return TRUE
#	print STDERR "Response from function CompareTables():  gSSR  IS  contained in overlap\n";
	return 1;
} else {
	# Return FALSE
#	print STDERR "Response from function CompareTables():  gSSR is NOT contained in overlap\n";
	return 0;
}

}
##### END SUB####################




#############################
### PRINT TABLE
### Traverse the data structure (see p.134 of "programming the perl DBI")
### (for each row in the array ref)
sub print_table{
	my $row_count = 0;
	foreach my $row (@$result_set_array_ref) {
	$row_count ++;
	##get the appropiate fields
	if ($row_count < 200) {
		print "$row_count\t$row->{BPSSRid}\t$row->{SeqID}\t$row->{chr}\t$row->{motif}\t$row->{len}\t$row->{score}\t$row->{ordered_position}\n";
#	print (join ("\t", @$row)); print "\n";   ## use this when the reference is an array of arrays, not array of hashes.
	}
	}
	print STDERR "\n\n\n Result set retrieved $row_count rows.\n";
}
##### END SUB #################



###############################
## STANDARD QUERY 


sub DoStandardQuery {


my $statement1= qq[
 select a.id as IRGP_gSSRid, b.BACPACacc,a.SSRtype,a.motif, a.start, a.end,a.len,a.score, a.SeqID, b.chr, b.ordered_position
 from EST_SSR.irgp_gSSR a
  inner join RGP_assembly.bacpac_curated_order b on SUBSTRING_INDEX(a.SeqID, '.', 1) = b.BACPACacc
 where (b.curated_ordering_version_id = 3 OR b.curated_ordering_version_id = 4)
 and b.chr=?
 order by b.chr ,b.ordered_position,a.start,a.end
];



## Let us look at the constructed statement
#print STDERR "The SQL statement being prepared is as follows:\n$statement1\n\n";

## EXECUTE QUERY
$sth1 = $dbh1->prepare($statement1);
 $sth1->bind_param( 1, $rice_chr );
$sth1->execute();

### RETRIEVE ALL RESULT SET IN SINGLE ARRAYREF of Hashes  (NOT recommended to very large results sets (like thousands of rows)) :
#The following data structure is described in "prog. the perl DBI" book at page 134 
#($ary_ref_1=a ref to an array that contains list of refs to hashes; one hash per row in SQLtable, with names of columns (i.e: BACPACacc) as the keys to retrieve values)
# ... I am passing a reference of the hash that stores wanted colums to the function fetchall_arrayref
my $main_array_ref= $sth1->fetchall_arrayref( \%table_columns ); # select which ones you want or leave empty if you want all columns

# OR retrieve reference as array of arrays instead of as array of hashes
#my $main_array_ref= $sth1->fetchall_arrayref( [1,2,4,15] );   

$sth1->finish;

return $main_array_ref;

}
##### END SUB####################



###### Deal with SSRs in Overlaps, write to STDOUT or to DB########
#Two (identical motif & similar in len,score) gSSRs are considered the same if their BACPACs
# are physically contigous ( ABS(ordered_positionA - ordered_positionB) = 1 ) 
# and their coordinates fall within the coordinates of the BAC overlaps
#
#In DB :  TABLE "Overlap_redundant" has fields IRGP_gSSRid1, SeqID1, IRGP_gSSRid2, SeqID2 
# Fore each Rice genome gSSR, keep its ID, BACPACacc,motif, ordered_position and find whether it is located in an overlap with neighbor
# need to perform more queries



#########
sub LabelOverlap_extras {  # returns the compiled lookup table with overlaps

my %compiled_overlap_table;  # a hash of refs to hashes (Key is BACacc, and references an array with overlpa info)
my $BACPAC_SSR_list_ref;
my $overlap_set_ref;
my $row;
my $thisBACPACacc;
my $previousBACPACacc;
my $i=0; #This row index
my $j; #Previous row index (define value in loop)
my $k=0;  # another counter
my $qBACleftcoord=undef;
my $qBACrightcoord=undef;
my $sBACleftcoord=undef;
my $sBACrightcoord=undef;

# Get list of ordered BACPACs containing SSRs from DB
$BACPAC_SSR_list_ref = GetOrderedListSSR_BACPACs($rice_chr); # and arrayref is returned.

print STDERR "\nlist of BACPACs compiled.\n";


#--- 
# prepare for later execution of SQL statement in 
# sub 'GetOverlap' and avoid preparing in loop

#my $statement3= qq[
#select a.Qacc,a.Sacc,a.Qstart,a.Qend,a.Sstart,a.Send, a.perc_identity, a.bit_score, a.aln_len
#        , b.BP_Seq_Len as Qseqlen, c.BP_Seq_Len as Sseqlen
#from RGP_assembly.mblast_res_bacpac_vs_bacpac a
#   inner join RGP_assembly.qchr b on a.Qacc=b.BACPACacc
#   inner join RGP_assembly.qchr c on a.Sacc=c.BACPACacc
#where  (Qacc =? AND Sacc =?)
#      AND E_val = '0.0' AND gap_openings < 50 AND mismatch < 30 and aln_len > 999
#order by Qacc,Qstart,Qend, bit_score desc
#];

##  Use Megablast_summary table instead.... All HSPs into one row is easier.
my $statement3= qq[
select a.Qacc,a.Sacc,a.QspanStart,a.QspanEnd,a.SspanStart,a.SspanEnd, a.gen_perc_identity
	, a.sum_bit_score, a.aln_len_sum, b.BP_Seq_Len as Qseqlen, c.BP_Seq_Len as Sseqlen
from RGP_assembly.megablast_summary a
   inner join RGP_assembly.qchr b on a.Qacc=b.BACPACacc
   inner join RGP_assembly.qchr c on a.Sacc=c.BACPACacc
where  ((Qacc = ? AND Sacc = ?) )
      and  aln_len_sum > 999
order by Qacc,QspanStart,QspanEnd
];


$sth3 = $dbh1->prepare($statement3);
#---


print STDERR "\nGoing through list and looking for overlaps...\n";


# Two loops; external is list of OrderedListSSR_BACPACs,
# internal is overlaps for each BACPAC pair, not implemented yet (returning 1 HSP only)

foreach $row (@$BACPAC_SSR_list_ref) {   # external LOOP

# Get a pair of contigous rows (this and the next) and extract BAC accessions  (This piece of code is working fine)
	$j=$i-1;
	$previousBACPACacc = $BACPAC_SSR_list_ref->[$j][1] ;  # second element in array corresponds to BACPACacc in table
	$thisBACPACacc     = $BACPAC_SSR_list_ref->[$i][1] ;
	$i++;  # increase now but don't use anymore..

#	print STDERR "The BACPACs selected are $previousBACPACacc and then $thisBACPACacc .\n";



# example of a pair of contigous and overlapping clones from chr 2: 
# overlap, Qacc ='AP005519' AND Sacc ='AP004049'
# Qstart = 2566 , Qend= 5166
# Sstart =  112345 ,  Send = 109745


#  Find overlap between this pair of BACPACs:  This is checking only FIRST row of overlaps for now

	$overlap_set_ref= &GetOverlap($previousBACPACacc,$thisBACPACacc); # a ref to ONE array of arrays (with "limit 1" in SQL)
	if (!defined($overlap_set_ref)){
#		print STDERR "... no overlap found between $previousBACPACacc and $thisBACPACacc. \n";
		next;
	}

#	historically I meant to correct for swap of Query|Subject when the table returned from blastDB, but now
#	this is implemented within the SQLquery by forcing it
#	(I expect to always find A->B if there is a B->A hit).  However I choose to leave code as is in case we return to old query
	if ($previousBACPACacc eq $overlap_set_ref->[0][0]) {  #If Prev BAC is the query in HSP (normal case)
		$qBACleftcoord=  $overlap_set_ref->[0][2];  
		$qBACrightcoord= $overlap_set_ref->[0][3];
		$sBACleftcoord=  $overlap_set_ref->[0][4];
		$sBACrightcoord= $overlap_set_ref->[0][5];
	}elsif ($thisBACPACacc eq $overlap_set_ref->[0][0]) {   #  query and subject were swapped, now Prev BAC is Subj.
		print STDERR " Query and Subject were swapped...\n";
		$sBACleftcoord=  $overlap_set_ref->[0][2];  
		$sBACrightcoord= $overlap_set_ref->[0][3];
		$qBACleftcoord=  $overlap_set_ref->[0][4];
		$qBACrightcoord= $overlap_set_ref->[0][5];
	}else{
		die "An error found when parsing overlaps between previous BAC $previousBACPACacc and current BAC $thisBACPACacc.  \n";
	}

	# SAVE COORDINATES OF THIS (MSP) OVERLAP TO TABLE
	# pushes whole array as one element into another anonymous array 
	# The key to search is the BACPACacc of the blast subject ("thisBACPACacc")

	# remember,  previousBACPACacc is treated as the query in the BLAST table.
	push (@{$compiled_overlap_table{$thisBACPACacc}}, [$previousBACPACacc,$qBACleftcoord,$qBACrightcoord,$sBACleftcoord,$sBACrightcoord] );

} # END outside LOOP

# close any remaining query handles
$sth3->finish;

return \%compiled_overlap_table;

}
########### END SUB ##########





#########  This SUB tested OK
### needs Chromosome num (scalar type) as argument;
sub GetOrderedListSSR_BACPACs {  # for Rchr 2-12 returns 2818 records on ref to array with SeqID, BACPACacc, chr, ordered_position (cmobines list of gSSRs and BAC order)

	my $Rchr = shift;  # obtain rice chrom number from argument

	return undef if ($Rchr==1);  # Rchr is based on pseudomols, so no overlaps exist.

	my $statement2= qq[
	select a.SeqID, b.BACPACacc, b.chr, b.ordered_position
	from EST_SSR.irgp_gSSR a
	  inner join RGP_assembly.bacpac_curated_order b
	  on SUBSTRING_INDEX(a.SeqID, '.', 1) = b.BACPACacc
	where b.curated_ordering_version_id = 4  and b.chr=?
	 group by SeqID
	order by b.chr ,b.ordered_position
	];

	## EXECUTE QUERY
	$sth2 = $dbh1->prepare($statement2);
	$sth2->bind_param( 1, $Rchr );
	$sth2->execute();
	my $array2_ref= $sth2->fetchall_arrayref(); # select which ones you want or leave empty if you want all columns
	$sth2->finish;
	return $array2_ref;
}
#######END SUB #####






########### This SUB tested OK ####
sub GetOverlap {  # Args: BAC1, BAC2. 
my ($BAC1,$BAC2) = @_;

## SQL for this is at caller (parent) function  LabelOverlap_Extras


## EXECUTE QUERY.  The PREPARE stmnt is outside the LOOP calling this function!!
$sth3->execute($BAC1,$BAC2);
my $overlapSet= $sth3->fetchall_arrayref(); # select all result set into a ref of an array of refs

# Return true (and write coords) if arguents are found as a match in mblast_res_bacpac_vs_bacpac table
if (!defined($overlapSet->[0])) { #return undef if result set is empty.
		return (undef);
}else{
#	my $Qseqlen = @$overlapSet[0]->[9] ; #len of query BAC
#	my $Sseqlen = $overlapSet->[0][10];  # another way to extract info, either way is fine
#	print STDERR "Query len is $Qseqlen , Subject len is $Sseqlen and overlap is $overlapSet->[0][8].\n";
	return $overlapSet;
}
}

########### END SUB ########



exit;

