#!/usr/bin/perl -w
use strict;

################################################################
## This script creates a report about a set of analyzed regulons
##
## Based on original in https://www.ncbi.nlm.nih.gov/pubmed/27557774
## Bruno Contreras Moreira, Ruben Sancho 2018

## parameters
my $RNDSAMPLES =50;
my $MAXSIG     =10; #20
#my $MAXSIGMEME =50;
my $MAXSIGGO   =60;
my $MINCOR     =0.7;
my $MINNCOR    =0.5; 

my $CEX        =2;
my $RIMGWIDTH  =400;
my $REPIMGWIDTH=100;

my $FOOTDBURL  = "http://floresta.eead.csic.es/footprintdb/index.php?motif=";


# input folders, one per regulon, previously processed with make -f regulons.mk
my @REGULONS= qw( D9 D15 D22 D30 D33 ); 
@REGULONS = qw( D1 D2 D3 D4 D5 D6 D7 D8 D9 );
@REGULONS = qw( W1 W2 W3 W4 W5 W6 W7 W8 W9 W10 W11 W12 W13 );

#my $ORTHSPECIES = 'Hordeum_vulgare.IBSCv2.37';
#my $EXPRDIR = 'expression_plots/';

my @sourcefiles = qw( README.txt report.pl go.mk peak-regulons.mk regulons.mk );
my @sourcedirs  = @REGULONS;

my $RVERBOSE = 0;
my $Rparams = '';
if(!$RVERBOSE){ $Rparams = '-q 2>&1 > /dev/null' }

my $outdir = $ARGV[0] || die "# usage: $0 <output folder>\n";

mkdir($outdir) if(! -e $outdir);

print "# output folder : $outdir\n";
#print "# EXPRDIR = $EXPRDIR\n";
#print "# ORTHSPECIES=$ORTHSPECIES\n";
print "# RNDSAMPLES=$RNDSAMPLES MAXSIGGO=$MAXSIGGO MAXSIG=$MAXSIG MINCOR=$MINCOR MINNCOR=$MINNCOR\n";

open(REPORT,'>',$outdir.'/report.tab');

#print REPORT "# regulon\tpromoters\tGO enrichment\tMEME\tMEME-logo\tMEME-orth\tMEME-footDB\tpeaks-oligo\tpeaks-dyad\tpeaks-logo\tpeaks-orth\tpeaks-footDB\n";
#print REPORT "# cluster\tGO enrichment\tMEME\tMEME-orth\tMEME-footDB\tpeaks-oligo\tpeaks-dyad\tpeaks-orth\tpeaks-footDB\n";
print REPORT "# name\tGO enrichment\tpeaks-oligo\tpeaks-dyad\tfootprintDB\n";


foreach my $reg (@REGULONS)
{
    ##1) GO analysis

    ##3) peak-motifs
    ##3.1) oligo sig of regulon vs random 
    ##3.2) dyad sig or regulon vs random 
    ##3.3) sig of motif vs footDB 

    print "\n# Making report of $reg\n";

    # print cluster name, contents and plot of expression profile
    print REPORT "<b>cluster $reg</b><br>";
    printf(REPORT "sequences: %d",(split(/\s+/,`wc -l $reg/regulon$reg.txt`))[0]);

	### GO analysis
	my $orig_report = "$reg/regulon$reg.go.tab";
    system("cp $orig_report $outdir/");

	my $sorted_report = "$outdir/regulon$reg.go.report.tab";
    my $report = "$outdir/regulon$reg.go.report.raw.tab";
	open(GOREPT,">",$report);
	
    my ($best_sig,$best_GO,$best_annot,$formatted_annot) = (0,'NA','','');
    open(GO,$orig_report) || warn "# cannot find $orig_report";
    while(<GO>)
    {
        #GO:0043530 GO:0043530 adenosine... 1 1 1 1 00 24111 1.00000 4.1e-05 0.27327 0.56 1
        next if(/^#/);
        my @data = split(/\t/,$_);
        $best_GO = $data[0];

        my @words = split(/\s+/,$data[2]);
        if(scalar(@words) > 5)
        {
            $formatted_annot = join(' ',@words[0 .. 2]).'..';
        }
        else{ $formatted_annot = $data[2] }
        

        $best_annot = "<a href=\"http://amigo.geneontology.org/amigo/term/$best_GO\">$best_GO</a><br>" .
            $formatted_annot ." (<a href=\"regulon$reg.go.tab\">full list</a>) ";
        $best_sig = $data[13];
        if($best_sig > $MAXSIGGO){ $best_sig = $MAXSIGGO }
	    last;
    }
    close(GO);

    print GOREPT "$reg\t$best_sig\t$best_GO\n";

    foreach my $rnd (1 .. $RNDSAMPLES)
    {
        ($best_sig,$best_GO) = (0,'NA');
        open(GO,"$reg/random$reg-$rnd.go.tab") || warn "# cannot find $reg/random$reg-$rnd.go.tab";
        while(<GO>)
        {
            next if(/^#/);
            my @data = split(/\t/,$_);
            $best_GO = $data[0];
            $best_sig = $data[13];
            if($best_sig > $MAXSIGGO){ $best_sig = $MAXSIGGO }
            last;
        }
        close(GO);
        
        print GOREPT "rnd$rnd\t$best_sig\t$best_GO\n";
    }

    close(GOREPT);

    system("sort -s -k2,2nr $report > $sorted_report");
    my $regulon_idx = -1;
    open(SORTREPT,$sorted_report);
    while(<SORTREPT>)
    {
        $regulon_idx++;
        next if(/^rnd-\d+/);
        last;
    }
    close(SORTREPT);

    my $colors = get_Rplot_colors($regulon_idx,$RNDSAMPLES+1);

    open(RSHELL,"|R --no-save $Rparams ") || die "# $0 : cannot call R: $!\n";
    print RSHELL<<EOR;
    png("$outdir/regulon$reg.go.report.png",width=$RIMGWIDTH);
    colors = c($colors)
    godata = read.table(file="$sorted_report",header=F,check.names=F);
    barplot( godata\$V2, names.arg=godata\$V1, col=colors, ylab="max significance", ylim = c(0,$MAXSIGGO), cex.lab=$CEX, cex.axis=$CEX)
    legend("topright", inset=.05, c("$reg cluster","$RNDSAMPLES random clusters"),
        fill=c('black','grey'), cex=$CEX)
    dev.off()
    q()
EOR
    close RSHELL;

    print REPORT "\t<a href='./regulon$reg.go.report.png'><img width='$REPIMGWIDTH' src='./regulon$reg.go.report.png'></a>";
    print REPORT "<br>$best_annot";

    ## peak-motifs analysis
    $sorted_report = "$outdir/regulon$reg.peaks.report.tab";
    $report = "$outdir/regulon$reg.peaks.report.raw.tab";
    open(PEAKREPT,">",$report);
    
    ($best_sig) = (0);
    my $best_oligo = '';
    my $file = "$reg/regulon$reg.rm.fna.peaks-rm/results/oligos_5-7nt/peaks_oligos-2str-noov_5-7nt_mvk2_2.tab";
    open(PEAK,$file) || warn "# cannot find $file\n";
    while(<PEAK>)
    {
        #seq    identifier      exp_freq        occ     exp_occ occ_P   occ_E   occ_sig rank    ovl_occ forbocc
        #caggcag caggcag|ctgcctg 0.0002062705470 13      3.37    5.2e-05 4.2e-01 0.37    1       1       78
        next if(/^[;#]/);
        my @data = split(/\t/,$_);    
        if($data[7] > $best_sig)
        { 
            $best_sig = $data[7]; 
            if($best_sig > $MAXSIG){ $best_sig = $MAXSIG } 
            $best_oligo = $data[1]; 
        }
    } 
    close(PEAK);
    
    my $best_sig_dyad = 0;
    my $best_dyad = '';
    $file = "$reg/regulon$reg.rm.fna.peaks-rm/results/dyads_test_vs_ctrl/peaks_dyads-2str-noov_3nt_sp0-20_bg_monads.tab";
    open(PEAKD,$file) || warn "# cannot find $file\n";
    while(<PEAKD>)
    {
        next if(/^[;#]/);
        my @data = split(/\t/,$_);
        if($data[7] > $best_sig_dyad)
        { 
            $best_sig_dyad = $data[7]; 
            if($best_sig_dyad > $MAXSIG){ $best_sig_dyad = $MAXSIG } 
            $best_dyad = $data[1];
        }
    }
    close(PEAKD);

    print PEAKREPT "$reg\t$best_sig\t$best_sig_dyad\n";

    foreach my $rnd (1 .. $RNDSAMPLES)
    {
        ($best_sig) = (0);
        $file = "$reg/random$reg-$rnd.rm.fna.peaks-rm/results/oligos_5-7nt/peaks_oligos-2str-noov_5-7nt_mvk2_2.tab";
        open(PEAK,$file) || warn "# cannot find $file";
        while(<PEAK>)
        {
            next if(/^[;#]/);
            my @data = split(/\t/,$_);
            if($data[7] > $best_sig){ $best_sig = $data[7]; if($best_sig > $MAXSIG){ $best_sig = $MAXSIG } }
        }
        close(PEAK);

        $best_sig_dyad = 0;
        $file = "$reg/random$reg-$rnd.rm.fna.peaks-rm/results/dyads_test_vs_ctrl/peaks_dyads-2str-noov_3nt_sp0-20_bg_monads.tab";
        open(PEAKD,$file) || warn "# cannot find $file\n";
        while(<PEAKD>)
        {
            next if(/^[;#]/);
            my @data = split(/\t/,$_);
            if($data[7] > $best_sig_dyad){ $best_sig_dyad = $data[7]; if($best_sig_dyad > $MAXSIG){ $best_sig_dyad = $MAXSIG } }
        }
        close(PEAKD);

        print PEAKREPT "rnd-$rnd\t$best_sig\t$best_sig_dyad\n";
    }

    close(PEAKREPT);

    system("sort -s -k2,2nr $report > $sorted_report");
    $regulon_idx = -1;
    open(SORTREPT,$sorted_report);
    while(<SORTREPT>)
    {
        $regulon_idx++;
        next if(/^rnd-\d+/);
        last;
    }
    close(SORTREPT);

    $colors = get_Rplot_colors($regulon_idx,$RNDSAMPLES+1);

    open(RSHELL,"|R --no-save $Rparams ") || die "# $0 : cannot call R: $!\n";
    print RSHELL<<EOR;
    png("$outdir/regulon$reg.peaks.oligos.report.png",width=$RIMGWIDTH);
    colors = c($colors)
    pdata = read.table(file="$outdir/regulon$reg.peaks.report.tab",header=F,check.names=F);
    barplot( pdata\$V2, names.arg=pdata\$V1, col=colors, ylab="max significance", ylim = c(0,$MAXSIG), cex.lab=$CEX, cex.axis=$CEX)
    legend("topright", inset=.05, c("$reg cluster","$RNDSAMPLES random clusters"),
        fill=c('black','grey'), cex=$CEX)
    dev.off()
    q()
EOR
    close RSHELL;

    print REPORT "\t<a href='./regulon$reg.peaks.oligos.report.png'><img width='$REPIMGWIDTH' ".
        "src='./regulon$reg.peaks.oligos.report.png'></a>";
    print REPORT "<br>$best_oligo";

    system("sort -s -k3,3nr $report > $sorted_report");
    $regulon_idx = -1;
    open(SORTREPT,$sorted_report);
    while(<SORTREPT>)
    {
        $regulon_idx++;
        next if(/^rnd-\d+/);
        last;
    }
    close(SORTREPT);

    $colors = get_Rplot_colors($regulon_idx,$RNDSAMPLES+1);

    open(RSHELL,"|R --no-save $Rparams ") || die "# $0 : cannot call R: $!\n";
    print RSHELL<<EOR;
    png("$outdir/regulon$reg.peaks.dyads.report.png",width=$RIMGWIDTH);
    colors = c($colors)
    pdata = read.table(file="$outdir/regulon$reg.peaks.report.tab",header=F,check.names=F);
    barplot( pdata\$V3, names.arg=pdata\$V1, col=colors, ylab="max significance", ylim = c(0,$MAXSIG), cex.lab=$CEX, cex.axis=$CEX)
    legend("topright", inset=.05, c("$reg cluster","$RNDSAMPLES random clusters"),
        fill=c('black','grey'), cex=$CEX)
    dev.off()
    q()
EOR
    close RSHELL;

    #print REPORT "\t<a href='./regulon$reg.peaks.dyads.report.png'><img width='$REPIMGWIDTH' src='./regulon$reg.peaks.dyads.report.png'></a>\t";
    print REPORT "\t<a href='./regulon$reg.peaks.dyads.report.png'><img width='$REPIMGWIDTH' ".
        "src='./regulon$reg.peaks.dyads.report.png'></a>";
    print REPORT "<br>$best_dyad";

    opendir(PEAKSDIR,"$reg/regulon$reg.rm.fna.peaks-rm/results/discovered_motifs");
    my $peaklogos = 0;
    foreach my $motifdir (grep {/_m\d+$/} readdir(PEAKSDIR))
    {
        $peaklogos++;
        #my $logofile = "$reg/regulon$reg.rm.fna.peaks-rm/results/discovered_motifs/$motifdir/peaks_$motifdir\_logo.png";
        #system("cp $logofile $outdir/$reg.peaks.logo$peaklogos\.png");
        #print REPORT "<a href='./$reg.peaks.logo$peaklogos\.png'><img width='$REPIMGWIDTH' src='./$reg.peaks.logo$peaklogos\.png'></a><br>";
    }
    close(PEAKSDIR);

    ## peak-motifs footDB scan
    $sorted_report = "$outdir/regulon$reg.peaks.footDB.report.tab";
    $report = "$outdir/regulon$reg.peaks.footDB.report.raw.tab";
    open(PEAKFOOTDB,">",$report);

    my $best_Ncor = 0;
    $file = "$reg/regulon$reg.rm.fna.peaks-rm/results/discovered_vs_db/peaks_motifs_vs_db_footDB.tab";
    open(PEAKMOTIF,$file) || warn "# cannot find $file\n";
    while(<PEAKMOTIF>)
    {
        #id1    id2 name1   name2   cor Ncor    logoDP  NIcor   NSW SSD NsEucl  w1  w2  w   W   Wr  wr1 wr2 
        #oligos_5-7nt_m2    2012.1_    oligos_5-7nt_m2 TRANSFAC_2012.1_    0.965   0.965   8.843   0.978   0.983   0.3368  
        next if(/^[;#]/);
        my @data = split(/\t/,$_);
        next if($data[4] < $MINCOR || $data[5] < $MINNCOR);
        if($data[5] > $best_Ncor){ $best_Ncor = $data[5] }
    }
    close(PEAKMOTIF);
    print PEAKFOOTDB "$reg\t$best_Ncor\n";

    foreach my $rnd (1 .. $RNDSAMPLES)
    {
        $best_Ncor = 0;   
        $file = "$reg/random$reg-$rnd.rm.fna.peaks-rm/results/discovered_vs_db/peaks_motifs_vs_db_footDB.tab";
        open(PEAKMOTIF,$file) || warn "# cannot find $file\n";
        while(<PEAKMOTIF>)
        {
            next if(/^[;#]/);
            my @data = split(/\t/,$_);
            next if($data[4] < $MINCOR || $data[5] < $MINNCOR);
            if($data[5] > $best_Ncor){ $best_Ncor = $data[5] }
        }
        close(PEAKMOTIF);
        print PEAKFOOTDB "rnd-$rnd\t$best_Ncor\n";
    }

    close(PEAKFOOTDB);

    system("sort -s -k2,2nr $report > $sorted_report");
    $regulon_idx = -1;
    open(SORTREPT,$sorted_report);
    while(<SORTREPT>)
    {
        $regulon_idx++;
        next if(/^rnd-\d+/);
        last;
    }
    close(SORTREPT);

    $colors = get_Rplot_colors($regulon_idx,$RNDSAMPLES+1);    
   
    open(RSHELL,"|R --no-save $Rparams ") || die "# $0 : cannot call R: $!\n";
    print RSHELL<<EOR;
    png("$outdir/regulon$reg.peaks.footDB.report.png",width=$RIMGWIDTH);
    colors = c($colors)
    data = read.table(file="$sorted_report",header=F,check.names=F);
    barplot( data\$V2, names.arg=data\$V1, col=colors, ylab="max Ncor in footprintDB", ylim=c(0,1), cex.lab=$CEX, cex.axis=$CEX)
    legend("topright", inset=.05, c("$reg cluster","$RNDSAMPLES random clusters"),
        fill=c('black','grey'), cex=$CEX/1.5)
    dev.off()
    q()
EOR
    close RSHELL; 

    print REPORT "\t<a href='./regulon$reg.peaks.footDB.report.png'><img width='$REPIMGWIDTH' src='./regulon$reg.peaks.footDB.report.png'></a>";
    system("cp -r $reg/regulon$reg.rm.fna.peaks-rm/ $outdir/regulon$reg.rm.fna.peaks-rm");

    # add links to footDB motifs

    my $edited_report = "$outdir/regulon$reg.rm.fna.peaks-rm/peaks_synthesis.html";
    open(EDITEDSYN,">",$edited_report) || warn "# cannot create $edited_report\n";

    open(SYN,"$reg/regulon$reg.rm.fna.peaks-rm/peaks_synthesis.html") || 
        warn "# cannot read $reg/regulon$reg.rm.fna.peaks-rm/peaks_synthesis.html\n";
    while(<SYN>)
    {
        #<td>ABF1:M00399:TRANSFAC</td>
        $_ =~ s/<td>(\S+?:\S+?:\S+?)<\/td>/<td><a href="$FOOTDBURL=$1" target="_blank">$1<\/a><\/td>/;
        print EDITEDSYN;
    }
    close(SYN);

    close(EDITEDSYN);


    print REPORT "<br><a href='regulon$reg.rm.fna.peaks-rm/peaks_small_summary.html'>logos</a> | ";
    print REPORT "<a href='regulon$reg.rm.fna.peaks-rm/peaks_synthesis.html'>summary</a>";

    # end report line
    print REPORT "\n";
}

# add source code and data to report
mkdir("$outdir/source");
foreach my $file (@sourcefiles)
{
    system("cp $file $outdir/source");
}

foreach my $dir (@sourcedirs)
{
    mkdir("$outdir/source/$dir");
    system("cp $dir/*.fna $outdir/source/$dir");
    system("cp $dir/*.txt $outdir/source/$dir");
}

print REPORT "# parameters:\n".
    "RNDSAMPLES=$RNDSAMPLES MAXSIGGO=$MAXSIGGO MAXSIG=$MAXSIG MINCOR=$MINCOR MINNCOR=$MINNCOR\n".
    "input data and code: <a href=\"./source/\">source</a>\n";

close(REPORT);

# make HTML report
system("text-to-html -i $outdir/report.tab -o $outdir/index.html");

######################################

sub get_Rplot_colors {
  my ($reference_idx,$n_of_orgs) = @_;
  my $colors = '';
  if($reference_idx > -1){
    if($reference_idx>0){ $colors = "rep('grey',$reference_idx)," }
    $colors .= "'black'"; # reference
    if($reference_idx<$n_of_orgs-1){ $colors .= sprintf(",rep('grey',%d)",($n_of_orgs-($reference_idx+1))) }
  }
  else{ $colors = "rep('grey',$n_of_orgs)" }

  return $colors;
}

