=head1 LICENSE

Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2019] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut

package XrefParser::RefSeqCoordinateParser;

use strict;
use warnings;
use Carp;
use DBI;

use base qw( XrefParser::BaseParser );
use Bio::EnsEMBL::Registry;

sub run_script {

  my ($self, $ref_arg) = @_;
  my $source_id    = $ref_arg->{source_id};
  my $species_id   = $ref_arg->{species_id};
  my $species_name = $ref_arg->{species};
  my $file         = $ref_arg->{file};
  my $verbose      = $ref_arg->{verbose};
  my $db           = $ref_arg->{dba};
  my $dbi          = $ref_arg->{dbi};
  $dbi = $self->dbi unless defined $dbi;

  if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
    croak "Need to pass source_id, species_id and file as pairs";
  }
  $verbose |=0;

  my $peptide_source_id = $self->get_source_id_for_source_name('RefSeq_peptide', 'otherfeatures', $dbi);
  my $mrna_source_id = $self->get_source_id_for_source_name('RefSeq_mRNA', 'otherfeatures', $dbi);
  my $ncrna_source_id = $self->get_source_id_for_source_name('RefSeq_ncRNA', 'otherfeatures', $dbi);

  my $pred_peptide_source_id =
    $self->get_source_id_for_source_name('RefSeq_peptide_predicted', 'otherfeatures', $dbi);
  my $pred_mrna_source_id =
    $self->get_source_id_for_source_name('RefSeq_mRNA_predicted','otherfeatures', $dbi);
  my $pred_ncrna_source_id =
    $self->get_source_id_for_source_name('RefSeq_ncRNA_predicted', 'otherfeatures', $dbi);

  if($verbose){
    print "RefSeq_peptide source ID = $peptide_source_id\n";
    print "RefSeq_mRNA source ID = $mrna_source_id\n";
    print "RefSeq_ncRNA source ID = $ncrna_source_id\n";
    print "RefSeq_peptide_predicted source ID = $pred_peptide_source_id\n";
    print "RefSeq_mRNA_predicted source ID = $pred_mrna_source_id\n" ;
    print "RefSeq_ncRNA_predicted source ID = $pred_ncrna_source_id\n" ;
  }

  my $user = "ensro";
  my $host;
  my $port = 3306;
  my $dbname;
  my $pass;
  my $transcript_score_threshold = 0.75;
  my $tl_transcript_score_threshold = 0.75;
  my $project;

# Grep the project name, should be ensembl or ensemblgenomes
  if($file =~ /project[=][>](\S+?)[,]/){
    $project = $1;
  }

# If specified, get core database connection details
  if($file =~ /host[=][>](\S+?)[,]/){
    $host = $1;
  }
  if($file =~ /port[=][>](\S+?)[,]/){
    $port =  $1;
  }
  if($file =~ /dbname[=][>](\S+?)[,]/){
    $dbname = $1;
  }
  if($file =~ /pass[=][>](\S+?)[,]/){
    $pass = $1;
  }
  if($file =~ /user[=][>](\S+?)[,]/){
    $user = $1;
  }

  my $ofuser = 'ensro';
  my $ofhost;
  my $ofport = 3306;
  my $ofdbname;
  my $ofpass;

# If specified, get otherfeatures database connection details
  if($file =~ /ofhost[=][>](\S+?)[,]/){
    $ofhost = $1;
  }
  if($file =~ /ofuser[=][>](\S+?)[,]/){
    $ofuser = $1;
  }
  if($file =~ /ofport[=][>](\S+?)[,]/){
    $ofport =  $1;
  }
  if($file =~ /ofdbname[=][>](\S+?)[,]/){
    $ofdbname = $1;
  }
  if($file =~ /ofpass[=][>](\S+?)[,]/){
    $ofpass = $1;
  }
 
  my $registry = "Bio::EnsEMBL::Registry";

  #get the species name
  my %id2name = $self->species_id2name($dbi);
  if (defined $species_name) { push @{$id2name{$species_id}}, $species_name; }
  if (!defined $id2name{$species_id}) { next; }
  $species_name = $id2name{$species_id}[0];

  my $core_dba;
  my $otherf_dba;

  if (defined $project && $project eq 'ensembl') {
# Can use user-defined database
      if (defined $host) {
          $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
              '-host'     => $host,
              '-user'     => $user,
              '-pass'     => $pass,
              '-dbname'   => $dbname,
              '-port'     => $port,
              '-species'  => $species_name.$host,
              '-group'    => 'core',
       );
      } else {
# Else, database should be on staging
      $registry->load_registry_from_multiple_dbs(
          {
              -host    => 'mysql-ens-sta-1',
	      '-port'    => 4519,
              -user    => 'ensro',
          },
       );
      $core_dba = $registry->get_DBAdaptor($species_name,'core');
      }
      if (defined $ofhost) {
# Can use user-defined database
          $otherf_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
              '-host'     => $ofhost,
              '-user'     => $ofuser,
              '-pass'     => $ofpass,
              '-port'     => $ofport,
              '-dbname'   => $ofdbname,
              '-species'  => $species_name,
              '-group'    => 'otherfeatures',
       );
       $otherf_dba->dnadb($core_dba);
      } else {
# Else database should be on staging
      $registry->load_registry_from_multiple_dbs( 
	  {
	      -host    => 'mysql-ens-sta-1',
	      '-port'    => 4519,
	      -user    => 'ensro',
	  },
       );
      $otherf_dba = $registry->get_DBAdaptor($species_name, 'otherfeatures') if !defined($ofhost);
      if (defined $otherf_dba) { $otherf_dba->dnadb($core_dba); }
    }
      

  } elsif (defined $project && $project eq 'ensemblgenomes') {
      $registry->load_registry_from_multiple_dbs( 
	  {
	      -host     => 'mysql-eg-staging-1.ebi.ac.uk',
	      -port     => 4160,
	      -user     => 'ensro',
	  },
	  {
	      -host     => 'mysql-eg-staging-2.ebi.ac.uk',
	      -port     => 4275,
	      -user     => 'ensro',
	  },
 
      );
      $core_dba = $registry->get_DBAdaptor($species_name,'core');
      $otherf_dba = $registry->get_DBAdaptor($species_name, 'otherfeatures');     

  } elsif (defined $db) {
    $otherf_dba = $db;
    $core_dba = $db->dnadb();
  } else {
      die("Missing or unsupported project value. Supported values: ensembl, ensemblgenomes");
  }

## Not all species have an otherfeatures database, skip if not found
  if (!$otherf_dba) {
    print STDERR "No otherfeatures database for $species_name, skipping import for refseq_import data\n";
    return;
  }

## Add link to EntrezGene IDs where available
  my (%entrez_ids) = %{ $self->get_valid_codes("EntrezGene", $species_id, $dbi) };
  my $entrez_source_id = $self->get_source_id_for_source_name('EntrezGene', undef, $dbi);
  my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref  (master_xref_id,dependent_xref_id, linkage_source_id) VALUES (?,?, $entrez_source_id)");

  my $sa = $core_dba->get_SliceAdaptor();
  my $sa_of = $otherf_dba->get_SliceAdaptor();
  my $chromosomes_of = $sa_of->fetch_all('toplevel', undef, 1);

# Fetch analysis object for refseq
  my $aa_of = $otherf_dba->get_AnalysisAdaptor();
  my $logic_name;
  foreach my $ana(@{ $aa_of->fetch_all() }) {
    if ($ana->logic_name =~ /refseq_import/) {
      $logic_name = $ana->logic_name;
    }
  }
## Not all species have refseq_import data, skip if not found
  if (!defined $logic_name) {
    print STDERR "No data found for RefSeq_import, skipping import\n";;
    return;
  }

  foreach my $chromosome_of (@$chromosomes_of) {
    my $chr_name = $chromosome_of->seq_region_name();
    my $genes_of = $chromosome_of->get_all_Genes($logic_name, undef, 1);

    while (my $gene_of = shift @$genes_of) {
      my $transcripts_of = $gene_of->get_all_Transcripts();

# Create a range registry for all the exons of the refseq transcript
      foreach my $transcript_of (sort { $a->start() <=> $b->start() } @$transcripts_of) {
	my ($id, $tl_id);
	# We're moving to RefSeq accessions being stored as xrefs rather than
	# stable ids. But we also need to maintain backwards compatbility.
	# If it's the new kind, where there's a display_xref use that,
	# otherwise fall back to using the stable_id. But also check if we
	# have neither, then skip the record.
	if (defined $transcript_of->display_xref ) {
	  $id = $transcript_of->display_xref->display_id;
	} elsif (defined $transcript_of->stable_id) {
	  $id = $transcript_of->stable_id;
	} else {
	  # Skip non conventional accessions
	  next;
	}
        if ($id !~ /^[NXMR]{2}_[0-9]+/)  { next; }
        my %transcript_result;
        my %tl_transcript_result;
        if (!defined $id) { next; }
        my $exons_of = $transcript_of->get_all_Exons();
        my $rr1 = Bio::EnsEMBL::Mapper::RangeRegistry->new();
        my $tl_exons_of = $transcript_of->get_all_translateable_Exons();
        my $rr3 = Bio::EnsEMBL::Mapper::RangeRegistry->new();

        foreach my $exon_of (@$exons_of) {
          my $start_of = $exon_of->seq_region_start();
          my $end_of = $exon_of->seq_region_end();
          $rr1->check_and_register( 'exon', $start_of, $end_of );
        }

        foreach my $tl_exon_of (@$tl_exons_of) {
          my $tl_start_of = $tl_exon_of->seq_region_start();
          my $tl_end_of = $tl_exon_of->seq_region_end();
          $rr3->check_and_register( 'exon', $tl_start_of, $tl_end_of );
        }

# Fetch slice in core database which overlaps refseq transcript
        my $chromosome = $sa->fetch_by_region('toplevel', $chr_name, $transcript_of->seq_region_start, $transcript_of->seq_region_end);
        my $transcripts = $chromosome->get_all_Transcripts(1);

# Create a range registry for all the exons of the ensembl transcript
        foreach my $transcript(@$transcripts) {
          if ($transcript->strand != $transcript_of->strand) { next; }
          my $exons = $transcript->get_all_Exons();
          my $rr2 = Bio::EnsEMBL::Mapper::RangeRegistry->new();
          my $rr4 = Bio::EnsEMBL::Mapper::RangeRegistry->new();
          my $exon_match = 0;
          my $tl_exons = $transcript->get_all_translateable_Exons();
          my $tl_exon_match = 0;

          foreach my $exon (@$exons) {
            my $start = $exon->seq_region_start();
            my $end = $exon->seq_region_end();
            my $overlap = $rr1->overlap_size('exon', $start, $end);
            $exon_match += $overlap/($end - $start + 1);
            $rr2->check_and_register('exon', $start, $end);
          }

          foreach my $tl_exon (@$tl_exons) {
            my $tl_start = $tl_exon->seq_region_start();
            my $tl_end = $tl_exon->seq_region_end();
            my $tl_overlap = $rr3->overlap_size('exon', $tl_start, $tl_end);
            $tl_exon_match += $tl_overlap/($tl_end - $tl_start + 1);
            $rr4->check_and_register('exon', $tl_start, $tl_end);
          }

          my $exon_match_of = 0;
          my $tl_exon_match_of = 0;

# Look for oeverlap between the two sets of exons
          foreach my $exon_of (@$exons_of) {
            my $start_of = $exon_of->seq_region_start();
            my $end_of = $exon_of->seq_region_end();
            my $overlap_of = $rr2->overlap_size('exon', $start_of, $end_of);
            $exon_match_of += $overlap_of/($end_of - $start_of + 1);
          }

          foreach my $tl_exon_of (@$tl_exons_of) {
            my $tl_start_of = $tl_exon_of->seq_region_start();
            my $tl_end_of = $tl_exon_of->seq_region_end();
            my $tl_overlap_of = $rr4->overlap_size('exon', $tl_start_of, $tl_end_of);
            $tl_exon_match_of += $tl_overlap_of/($tl_end_of - $tl_start_of + 1);
          }

# Comparing exon matching with number of exons to give a score
          my $score = ( ($exon_match_of + $exon_match)) / (scalar(@$exons_of) + scalar(@$exons) );
          my $tl_score = 0;
          if (scalar(@$tl_exons_of) > 0) {
            $tl_score = ( ($tl_exon_match_of + $tl_exon_match)) / (scalar(@$tl_exons_of) + scalar(@$tl_exons) );
          }
          if ($transcript->biotype eq $transcript_of->biotype) {
            $transcript_result{$transcript->stable_id} = $score;
            $tl_transcript_result{$transcript->stable_id} = $tl_score;
          } else {
            $transcript_result{$transcript->stable_id} = $score * 0.90;
            $tl_transcript_result{$transcript->stable_id} = $tl_score * 0.90;
          }
        }

        my $best_score = 0;
        my $best_tl_score = 0;
        my $best_id;
        my ($score, $tl_score);
# Comparing the scores based on coding exon overlap
# If there is a stale mate, chose best exon overlap score
        foreach my $tid (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) {
          $score = $transcript_result{$tid};
          $tl_score = $tl_transcript_result{$tid};
          if ($score > $transcript_score_threshold || $tl_score > $tl_transcript_score_threshold) {
            if ($tl_score >= $best_tl_score) {
              if ($tl_score > $best_tl_score) {
                $best_id = $tid;
                $best_score = $score;
                $best_tl_score = $tl_score;
              } elsif ($tl_score == $best_tl_score) {
                if ($score > $best_score) {
                  $best_id = $tid;
                  $best_score = $score;
                }
              }
            } elsif ($score >= $best_score) {
              $best_id = $tid;
              $best_score = $score;
            }
          }
        }

# If a best match was defined for the refseq transcript, store it as direct xref for ensembl transcript
        if ($best_id) {
          my ($acc, $version) = split(/\./, $id);
	  $version =~ s/\D//g if $version;
          my $source_id;
          $source_id = $mrna_source_id if $acc =~ /^NM_/;
          $source_id = $ncrna_source_id if $acc =~ /^NR_/;
          $source_id = $pred_mrna_source_id if $acc =~ /^XM_/;
          $source_id = $pred_ncrna_source_id if $acc =~ /^XR_/;
          # Accession should be of format NM_/XM_/NR_/XR_ otherwise it is not valid
          if (!defined $source_id) { next; }
          my $xref_id = $self->add_xref({ acc => $acc,
                                          version => $version,
                                          label => $id,
                                          desc => undef,
                                          source_id => $source_id,
                                          species_id => $species_id,
                                          dbi => $dbi,
                                          info_type => 'DIRECT' });
          $self->add_direct_xref($xref_id, $best_id, "Transcript", "", $dbi);

	  my $t_of = $transcript_of;
          my $g_of = $t_of->get_Gene();
          my $entrez_id = $g_of->stable_id();
          my $tl_of = $t_of->translation();
          my $ta = $core_dba->get_TranscriptAdaptor();
          my $t = $ta->fetch_by_stable_id($best_id);
          my $tl = $t->translation();

# Add link between Ensembl gene and EntrezGene
          if (defined $entrez_ids{$entrez_id} ) {
            foreach my $dependent_xref_id (@{$entrez_ids{$entrez_id}}) {
              $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id);
            }
          }

# Also store refseq protein as direct xref for ensembl translation, if translation exists
          if (defined $tl && defined $tl_of) {
            if ($tl_of->seq eq $tl->seq) {
              $tl_id = $tl_of->stable_id();
              my @xrefs = grep {$_->{dbname} eq 'GenBank'} @{$tl_of->get_all_DBEntries};
              if(scalar @xrefs == 1) {
                $tl_id = $xrefs[0]->primary_id();
              }
              ($acc, $version) = split(/\./, $tl_id);
              $source_id = $peptide_source_id;
              $source_id = $pred_peptide_source_id if $acc =~ /^XP_/;
              my $tl_xref_id = $self->add_xref({ acc => $acc,
                                              version => $version,
                                              label => $tl_id,
                                              desc => undef,
                                              source_id => $source_id,
                                              species_id => $species_id,
                                              dbi => $dbi,
                                              info_type => 'DIRECT' });
              $self->add_direct_xref($tl_xref_id, $tl->stable_id(), "Translation", "", $dbi);
            }
          }
        }
      }
    }
  }
  return 0;
}

1;
