#!/usr/bin/env perl
############################################################
#
# $Id: install-organism,v 1.129 2013/10/13 12:24:00 jvanheld Exp $
#
# Time-stamp: <2017-03-17 10:53:06 eda>
#
############################################################

## TO DEBUG: with option -org_list all organisms get the same name
## (the first one of the list). JvH 2016-02-17.

# use strict;
BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";
require "RSAT_to_ensembl.lib.pl";
use RSAT::util;
use RSAT::Tree;
use RSAT::TreeNode;
use RSAT::organism;


if ($0 =~ /([^(\/)]+)$/) {
  push (@INC, "$`lib/");
}

## Main package
package main;
{

  ################################################################
  ## Initialize parameters
  local $start_time = &RSAT::util::StartScript();
  our $org_table = $ENV{RSAT}."/public_html/data/supported_organisms.tab";
  our @masking_modes = ("");	## by default, use no masking
  our @oligo_lengths=(6,1,2,3,4,5,7,8);
  our $verbose = 0;

  our $seq_format = "fasta"; ## default format for exporting sequences
  our $img_format = $ENV{rsat_img_format} || "png";
  our $noov="";
  our $strands="-1str";
  our $purged_frequencies = 0; #### temporarily inactivated because for human genome it suppresses almost all sequences !!!!
  our %supported_for_installation;
  $supported_for_installation{"fasta"} = 1;
  $supported_for_installation{"raw"} = 1;
  $supported_for_installation{"filelist"} = 1;

  our @default_tasks = qw(
      start_stop
      allup
      seq_len_distrib
      genome_segments
      upstream_freq
      protein_seq
      protein_freq
      protein_len
      oligos
      dyads
      fasta_genome
      fasta_genome_rm
      chrom_sizes
      index_bedtools
      );
#      config ## JvH suppresed config from default tasks to enable submission to job scheduler without destroying the table of supported organisms
  
  our @extra_tasks = qw(
      config
      genome_freq
      phylogeny
      uninstall
      erase
      distrib
      ncf
      intergenic_freq
      ensembl_freq
      clean
      default
      rewrite
      all
      );

  ## Tasks to handle download and parsing of genomes from NCBI ftp/rsync site.
  our @species_tasks = qw(
      available
      download
      list
      parse);

  ## Tasks compatible with batch execution
  our @batch_tasks = qw(
      phylogeny
      start_stop
      allup
      seq_len_distrib
      genome_segments
      upstream_freq
      genome_freq
      protein_seq
      protein_freq
      protein_len
      oligos
      dyads
      default
      fasta_genome
      fasta_genome_rm
      chrom_sizes
      index_bedtools
      distrib
      ncf
      intergenic_freq
      ensembl_freq
      );
  
  our @organism_tasks = (@default_tasks, @extra_tasks);
  foreach my $task (@organism_tasks) {
    $organism_task{$task} = 1;
  }

  foreach my $task (@species_tasks) {
    $species_task{$task} = 1;
  }

  our @supported_tasks = (@species_tasks, @organism_tasks);
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }
  $supported_tasks = join ",", @supported_tasks;

  foreach my $task (@batch_tasks) {
    $batch_task{$task} = 1;
  }

  our %task = ();
  our @tasks = ();
  $parse_options = "";

  $null = "<NA>";		## NULL value
  $source = $null;
  $taxonomy = $null;
  $taxid = $null;
  @taxa = ();
  our %infile = ();

  ## Export format for the lists of organisms or assemblies
  %supported_list_format = ("org" => 1,
			    "table" => 1,
			    "bash" => 1);
  our $list_format = "table"; # Default list format. 
  
  ## File containing a list of organisms to install
  $infile{organisms} = "";

  ## File containing a list of species to install from NCBI/Refseq FTP site
  $infile{species} = "";

  ## List of organisms
  our @species = (); # for download and install
  our @organisms = (); # organisms are concatenation of species + strain

  ## Strain
  our $strain = "";
  our $max_strains = ""; # max number of strains per species for parsing task

  ## Group
  our $group = "";

  ## options for testing and debugging
  $skip = 0;  ## Skip the top organisms from the lsit
  $last = 0;  ## Stop after a given number of organisms from the list

  our $backup_org_table = 0;
  
  ## installation date
  local $install_date = $force_date || &AlphaDate();
  chomp $install_date;

  ## Store the starting directory
  $dir{main} = $ENV{PWD};

  ## Directory to download files from NCBI
  $dir{downloads} = $ENV{DOWNLOAD_DIR} || $ENV{RSAT}."/downloads/";
#  $dir{refseq} = $ENV{REFSEQ_DIR} || $ENV{RSAT}."/downloads/refseq";

  ## Options for the job scheduler
  our $dry = 0;
  our $die_on_error = 0;
  our $batch = 0;
  our $job_prefix = "install_org";
  our $log_handle = "";
  our $err_handle = "";
  our $cluster_queue = $ENV{CLUSTER_QUEUE};
  our $download_date = "";

  ################################################################
  ## Read arguments
  &ReadArguments();

  ################################################################
  ## Open output stream 
  our $out = &OpenOutputFile($outputfile);

  ################################################################
  ## Check parameter values

  @task = keys %task;
  if ($#task == -1) {
    &RSAT::error::FatalError("You should specify at least one task.\nSupported tasks\n\t$supported_tasks\n");
  }

  ## Set on the default tasks is the task "default" was invoked
  if ($task{default}) {
      foreach my $task (@default_tasks) {
	  $task{$task} = 1;
      }
      &RSAT::message::Info("Default tasks selected: ", join(", ", @default_tasks)) if ($main::verbose >= 2);
  }

  ## Run all the tasks for each organism (not the species-wide tasks
  ## like downloading and parsing). Note: by "organism" we mean here a
  ## given strain of a given species.
  if ($task{all}) {
    %task = %organism_task;
  }


  ## Check compatibility between batch and task dependencies
  if ($batch) {
    if ($task{allup} && $task{upstream_freq}) {
      &RSAT::error::FatalError("Tasks allup and upstream_freq cannot be combined in batch mode. First run allup, then upstream_freq.");
    }

    if ($task{config}) {
	&RSAT::error::FatalError("Task config cannot be executed in batch mode. ");
    }
  }

  ## Consistency between tasks
  if ($task{oligos} || $task{dyads}) {
      unless (($task{upstream_freq}) ||
	      ($task{intergenic_freq}) ||
	      ($task{protein_freq}) ||
	      ($task{genome_freq}) ||
	      ($task{ensembl_freq})) {
	  &RSAT::error::FatalError("The tasks 'oligos' and 'dyads' require to specify at least one sequence type among the following. ",
				   "\n\tupstream_freq,intergenic_freq,genome_freq,protein_freq,ensembl_freq");
    }
  }
  if (($task{protein_freq}) && !($task{oligos})) {
    &RSAT::error::FatalError("The task 'protein_freq' requires to activate the task 'oligos'");
  }

  ## Count the number of species-wide and organism-specific takss
  our $requested_species_tasks = 0;
  our $requested_organism_tasks = 0;
  &RSAT::message::TimeWarn("Checking tasks", join(";", keys(%task))) if ($main::verbose >= 2);
  foreach my $task (keys(%task)) {
      if ($species_task{$task}) {
	  $requested_species_tasks++;
      }
      if ($organism_task{$task}) {
	  $requested_organism_tasks++;
      }
  }
  &RSAT::message::Info("Species tasks", $requested_species_tasks) if ($main::verbose >= 2);
  &RSAT::message::Info("Organism tasks", $requested_organism_tasks) if ($main::verbose >= 2);
  

  ################################################################
  ## If a species file has been specified, read species list
  if ($infile{species}) {
    &RSAT::message::TimeWarn("Reading species file", $infile{species}) if ($main::verbose >= 1);
    my ($speciess) = &OpenInputFile($infile{species});
    while (<$speciess>) {
      next unless (/\S/); ## Skip empty rows
      next if (/^;/); ## Skip comment rows
      next if (/^#/); ## Skip header rows
      chomp();
      my ($species) = split /\s+/;
      push @species, $species;
    }
    &RSAT::message::Info(scalar(@species), "species read from file", $infile{species}) if ($main::verbose >= 1);
  }


  ## Index the selected species
  our %selected_species = ();
  for my $species (@species) {
      $selected_species{$species} = 1;
  }
  &RSAT::message::Info("Selected species",  join("; ", keys(%selected_species))) if ($main::verbose >= 1);
  
  ################################################################
  ## Define the local download directory for NCBI Refseq files
  #$source = "NCBI-Refseq";
  our $source = "NCBI"; ## 2019 this format works better for the newest formating in NCBI
  
  ## Check the directory containing the Genbank-formatted organisms
  ## (which can be downloaded from NCBI/refseq).
  unless ($dir{refseq}) { # Refseq dir specified on the command line
      $dir{refseq} = $dir{downloads}."/refseq";      
  }
  &RSAT::util::CheckOutDir($dir{refseq});
  

  ################################################################
  ## Check consistency of arguments for download/list/parse tasks.
  if ($task{parse} || $task{download} || ($task{config} && ($group))) {
    ## Check that all required parameters are defined

    ## One taxonomic group
    unless ($group) {
      &RSAT::error::FatalError("Tasks download, list and parse require to specify a group (e.g. fungi, bacteria, ...) to which all the species to download should belong.");
    }

    ## At least one species is required
#    unless (scalar(@species) >= 1) {
#      &RSAT::error::FatalError("Tasks download, list, parse and config require to specify at least one species (e.g. Saccharomyces_cerevisiae, Escherichia_coli). To get a list of available species, use option '-task> available'. ");
#    }

    ## If strain is specified, check that only one species has been requested
    if ($strain) {
      &RSAT::error::FatalError("Option -strain is only compatible with a single species. ") unless (scalar(@species) == 1);
    }
  }

  our %remaining_task = %task;
  

  ################################################################
  ## Assemblies
  
  our %strain_info;
  our @sorted_strains;
  our $selected_strains;

  
  if ((($infile{organisms} eq "") && (scalar(@organisms) == 0) && ($taxon eq ""))
      || $task{available} 
      || $task{list} 
      || $task{download} || $task{parse}) {

      ################################################################
      ## Define the name of the assembly summary file
      ## File containing the assembly summary downloaded from NCBI
      
      #  my $today = &trim(`date +%Y-%m-%d`);
      #  $outfile{assembly_summary} = "ncbi_organisms_assembly_summary_".${group}."_refseq_".${today}.".tsv";  
      # $outfile{assembly_summary} = $dir{downloads}."/ncbi_organisms_assembly_summary_".${group}."_refseq.tsv";
      unless ($group) {
	  &RSAT::error::FatalError("Use the option -group to specify the taxonomic group");
      }
      $dir{assembly_summary} = join("/", $dir{downloads}."refseq", $group);
      $outfile{assembly_summary} = join("/", $dir{assembly_summary}, "/assembly_summary.txt");
      $outfile{assembly_summary} =~ s|//assembly|/assembly|;
      $outfile{assembly_columns} = $outfile{assembly_summary}; 
      $outfile{assembly_columns} =~ s/\.txt/_colums.tsv/;
#  $outfile{assembly_columns} = "ncbi_organisms_assembly_summary_".${group}."_refseq_".${today}."_columns.tsv";

      ################################################################
      ## Download or update the assembly summary table for the selected group
      if ($task{available}) {
	  &DownloadAssemblySummary($group);

	  ## If all tasks have been completed, skip the rest
	  delete($remaining_task{available});
	  if (scalar(keys(%remaining_task)) == 0) {
	      &Finish();
	  }
      }

      ################################################################
      ## Read assembly summary table
      %strain_info = &ReadAssemblySummary($outfile{assembly_summary});
      @sorted_strains = sort {$strain_info{$a}->{"organism"} cmp $strain_info{$b}->{"organism"}} keys(%strain_info);
      $selected_strains = scalar(keys(%strain_info));

  }

  ## List assemblies if requested
  if ($task{list}) {
      &ListAssemblies($list_format, %strain_info);

      ## If all tasks have been completed, skip the rest
      delete($remaining_task{list});
      if (scalar(keys(%remaining_task)) == 0) {
	  &Finish();
      }
  }


  ################################################################
  ##  Print arguments if required
  &Verbose() if ($main::verbose >= 1);
  
  
  ## Rewrite config table
  if($task{rewrite}){
      &RewriteConfigTab();
      exit();
  }

  ################################################################
  ## Download the required files from NCBI FTP site
  if ($task{download}) {

      ## Download each selected strain
      my $strain_nb = 0;
      &RSAT::message::Info("Downloading", $selected_strains, "assemblies from NCBII") if ($main::verbose >= 1);

      for my $strain (@sorted_strains) {
	  $strain_nb++; 
	  if (($skip > 0) && ($strain_nb <= $skip)) {
	      &RSAT::message::TimeWarn("Skipping assembly", $strain_nb."/".$selected_strains, $strain_info{$strain}->{"organism"})
		  if ($main::verbose >= 1);
	      next;
	  }
	  if (($last > 0) && ($strain_nb > $last)) {
	      &RSAT::message::TimeWarn("Stopping after assembly", ($strain_nb-1)."/".$selected_strains, $strain_info{$strain}->{"organism"})
		  if ($main::verbose >= 1);
	      last;
	  }
	  &RSAT::message::TimeWarn("Downloading strain", $strain_nb."/".$selected_strains, $strain_info{$strain}->{"organism"})
	      if ($main::verbose >= 1);

	  ## Restrict the number of strains if requested
	  if (($max_strains > 0) && ($strain_nb > $max_strains)) {
	      my $remaining_strains = $selected_strains - $strain_nb + 1;
	      &RSAT::message::Info("Max number of strains reached", $max_strains, "Skipping", $remaining_strains, "strains of", $selected_strains)
		  if ($main::verbose >= 1);
	      last;
	  }


	  ## Define source URL for download
	  my $URL = $strain_info{$strain}->{"ftp_path"};
	  unless ($URL =~ "ftp.ncbi.nlm.nih.gov") {
	      &RSAT::error::FatalError("Strain", $strain, "Invalid URL for NCBI download", "'".$URL."'");
	  }
	  if ($main::verbose >= 1) {
	      &RSAT::message::TimeWarn("Downloading genome for strain", $strain,
				       "\n\tfrom NCBI FTP site", $URL,
				       "\n\tto local dir", $strain_info{$strain}->{"downloads"});
	  }

	  ## Create local download directory if needed
	  &RSAT::util::CheckOutDir($strain_info{$strain}->{"downloads"});

	  ## Build the download command
	  $RSYNC_URL = $URL;
	  $RSYNC_URL =~ s/https/rsync/;
	  $EXCLUDE = " --exclude '*_graph.bw'";
	  $EXCLUDE .= " --exclude RefSeq_transcripts_alignments";
	  my $command = "rsync --copy-links --times --recursive --verbose ".${EXCLUDE}." ".$RSYNC_URL."/* ".$strain_info{$strain}->{"downloads"}."/";

	  ## Download data from NCBI
	  &doit($command, $dry, $die_on_error, $main::verbose);
	  &RSAT::message::TimeWarn("Downloaded genome in dir", $strain_info{$strain}->{"downloads"}) if ($main::verbose >= 3);

	  ## Store download date in strain_info
	  $download_date = &AlphaDate();
	  $strain_info{$strain}->{"download_date"} = $download_date;
      }
  }


  ## Parse 
  if ($task{parse}) {
      my $strain_nb = 0;
#      my $selected_strains = scalar(keys(%strain_info));
      if ($main::strain) {
	  @sorted_strains = $strain;
	  $species = $species[0];
	  unless ($species) {
	      &FatalError("Option -strain requires to specify species (i.e. Genus_species) with option -species");
	  }
	  my ($genus, @species_pieces) = split(/[ _]/, $species);
	  my $species = join "_", @species_pieces; ## join the elements of species, which sometimes contain sub-species levels e.g. strain
	  my $organism = join("_", $genus, $species, $strain);
	  $strain_info{$strain}->{"strain"} = $strain;
	  $strain_info{$strain}->{"species"} = $species;
	  $strain_info{$strain}->{"genus_species"} = $genus."_".$species;
	  $strain_info{$strain}->{"organism"} = $organism;
	  $strain_info{$strain}->{"downloads"} =
	      join("/",
		   $ENV{RSAT},
		   "downloads",
		   "refseq",
		   $group,
		   $strain_info{$strain}->{"genus"},
		   $strain_info{$strain}->{"genus_species"},
		   $strain);

	  &RSAT::message::Debug("organism= ", $organism, "\n",
				"genus= ", $genus, "\n",
				"species= ", $species, "\n",
				"genus_species= ", $strain_info{$strain}->{"genus_species"}, "\n",
				"strain= ", $strain, "\n",
				"downloads= ", $strain_info{$strain}->{"downloads"}, "\n",
	      ) if ($main::verbose >= 5);
#	  die "HELLO"; 

	  &RSAT::message::Info("Parsing a single strain specified with options -strain ${strain} -species ${species}") if ($main::verbose >= 1);
      }
      
      for my $strain (@sorted_strains) {
	  $strain_nb++; 
	  if (($skip > 0) && ($strain_nb <= $skip)) {
	      &RSAT::message::TimeWarn("Skipping assembly", $strain_nb."/".$selected_strains, $strain_info{$strain}->{"organism"})
		  if ($main::verbose >= 1);
	      next;
	  }
	  if (($last > 0) && ($strain_nb > $last)) {
	      &RSAT::message::TimeWarn("Stopping after assembly", ($strain_nb-1)."/".$selected_strains, $strain_info{$strain}->{"organism"})
		  if ($main::verbose >= 1);
	      last;
	  }
	  &RSAT::message::TimeWarn("Parsing strain", $strain_nb."/".$selected_strains, $strain_info{$strain}->{"organism"})
	      if ($main::verbose >= 1);
	  my $organism_name = $strain_info{$strain}->{"organism"};
	  my $genome_dir = $ENV{RSAT}."/public_html/data/genomes/".$organism_name."/genome";
	  &ParseGenome($strain_info{$strain}->{"species"},
		       $strain,
		       $organism_name,
		       $strain_info{$strain}->{"downloads"},
		       $genome_dir,
		       $org_nb,
		       %{$strain_info{$strain}});
      }
  }



  ################################################################
  ## Select organisms to install
  if ($main::all_organisms) {
      ## Select all organisms
      @organisms = &RSAT::OrganismManager::get_supported_organisms();

  } elsif ($infile{organisms}) {
      ## If an organism file has been specified, read organism list
      my ($orgs) = &OpenInputFile($infile{organisms});
      while (<$orgs>) {
	  next unless (/\S/); ## Skip empty rows
	  next if (/^;/); ## Skip comment rows
	  next if (/^#/); ## Skip header rows
	  chomp();
	  my ($org) = split /\s+/;
	  push @organisms, $org;
      }

  } elsif (scalar(@taxa) > 0) {
      
      ## If a taxon has been specified, collect the list of
      ## organisms belonging to this taxon in the RSAT-supported
      ## organisms
      if ($task{parse}) {
	  &RSAT::message::Warning("With the option -taxon, the parse option will only re-parse previously installed organisms from the selected taxon");
      }
      
      $tree = new RSAT::Tree();
      $tree->LoadSupportedTaxonomy("Organisms", \%supported_organism);
      
      foreach my $taxon (@taxa) {
	  my $taxon_node = $tree->get_node_by_id($taxon);
	  my @taxon_organisms = $taxon_node->get_all_descendents("DFS","leaf",undef,undef);
	  &RSAT::message::TimeWarn("Collected", scalar(@taxon_organisms), "organisms for taxon", $taxon, $taxon_node) if ($main::verbose >= 1);
	  foreach my $org_node (@taxon_organisms) {
	      my $short_organism_name = $org_node->getid();
	      push @organisms, $short_organism_name;
	  }
      }   
      @organisms = sort (@organisms);
  
  } elsif (scalar(@organisms) == 1) {
      ## Install all organisms found in assembly summary and that
      ## passed the filters if specified
      for my $strain (@sorted_strains) {
	  push @organisms, $strain_info{$strain}->{"organism"};
      }
  } elsif (scalar(@sorted_strains) > 0) {
      for my $strain (@sorted_strains) {
	  push @organisms, $strain_info{$strain}->{"organism"};
      }
  }

      
  &RSAT::message::Info("Organisms to install", scalar(@organisms)) if ($main::verbose >= 1);
  # if ($main::verbose >= 10) {
  #     &RSAT::message::Info(join("; ", @organisms));
  # }


  ## Check that at least one organism has been specified
  unless ($task{list}) { 
      unless ((scalar(@organisms) > 0) || ($task{parse})) {
	  # if ($strain && $species[0]) {
	  #     @organisms = (join("_", $species, $strain));
	  # } else {
	      &RSAT::error::FatalError("At least one organism should be specified (options -org, -taxon or -all_organisms)")
	  # }
      }
  }


  ################################################################
  ## Some options are incompatible with multiple organisms
  if (scalar(@organisms) > 1) {
      &RSAT::message::FatalError("The option -organism (organism full name) is not valid when multiple organisms are selected.")
	  if ($user_specified_orgname);
      &RSAT::message::FatalError("The option -dir is (installation directory) not valid when multiple organisms are selected.")
	  if ($install_dir);
      &RSAT::message::FatalError("The option -syn (synonym table) is not valid when multiple organisms are selected.")
	  if ($outfile{synonyms});
  }


  ################################################################
  ## Check if any task has to be done on organisms (and not just to
  ## NCBI species).
  if ($requested_organism_tasks == 0) {
      &RSAT::message::Warning("No organism-specific task has been requested. ") if ($main::verbose >= 1);
      exit(0);
  }

  
  ################################################################
  ## Iterate installation over selected organisms
  $i = 0;
  foreach our $organism_short_name (@organisms) {
    $i++;
    if (($skip > 0) && ($i <= $skip)) {
	&RSAT::message::TimeWarn("Skipping organism", $i."/".scalar(@organisms), $organism_short_name) if ($main::verbose >= 1);
	next;
    }
    if (($last > 0) && ($i > $last)) {
	&RSAT::message::TimeWarn("Stopping after organism", ($i-1)."/".scalar(@organisms), $organism_short_name) if ($main::verbose >= 1);
	last;
    }
    &RSAT::message::TimeWarn("Installing organism", $i."/".scalar(@organisms), $organism_short_name) if ($main::verbose >= 1);

    $backup_org_table = 0 if ($i > 1); ## When we treat multiple organisms, we don't want to store a backup for each one
    
    if ($batch_org) {
	my $command = "install-organism";
	$command .= " -v ".$main::verbose;
	$command .= " -group ".$group if ($group);
	$command .= " -org ".$organism_short_name;

	my @tasks_to_batch = ();
	foreach my $task (@tasks) {
	    if ($batch_task{$task}) {
		push @tasks_to_batch, $task;
	    }
	}
	my $org_tasks = join(",", @tasks_to_batch);
	$command .= " -task ".$org_tasks;
	&RSAT::message::Info("Organism batch command\t", $command) if ($main::verbose >= 1);
	&doit($command, $dry, $die_on_error, $main::verbose, $batch_org, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
	next;
    }

    unless (($task{erase}) || ($task{uninstall})) {

      ## Create an object to hold the organism attribtues
      $organism = new RSAT::organism();
#    $organism->check_name($organism_short_name, 1);
      $organism->force_attribute("ID", $organism_short_name);
      $organism->set_attribute("feature_types", '*');
      
      ################################################################
      ## If the organism was previously supported, ensure that the
      ## serialized files are suppressed.
      &RSAT::message::Info("Cleaning previous serialized files") if ($main::verbose >= 3);
      $organism->delete_serial_files();


      ################################################################
      ## Automatic full name specification
      my $organism_full_name = "";
      unless ($organism_full_name{organism_short_name}) {
	  if ($user_specified_orgname) {
	      ## Full name specified with option -organism on the command line
	      $organism_full_name = $user_specified_orgname;
	  } else {
	      ## Build organism full name from the organism short name (replace _ by spaces)
	      $organism_full_name = $organism_short_name;
	      if (($source eq 'ensemblgenomes') || ($source eq 'ensembl')){
		  $organism_full_name = (defined $species[0] && !($species[0] eq '')) ? ucfirst($species[0]) : $organism_short_name;
		  @names = split($organism_full_name."_", $organism_short_name);
		  $organism->set_attribute("genome_assembly", $names[1]);
	      }
	      $organism_full_name =~ s/\_/ /g;
	  }
      }
      #      $organism->set_attribute("name", $organism_short_name);
      $organism->set_attribute("name", $organism_full_name);
      $organism_full_name{$organism_short_name} = $organism_full_name;
      
      
      ################################################################
      ## Installation directories and files
      umask 0002;
      
      ## TO DO (JvH, 2017-12-12): below and elsewhere in the script,
      ## replace the global hash tables by attributes set on the
      ## organism object;
      
      if ($install_dir) {
	  $dir{install} = $install_dir;
	  &RSAT::message::Info("Installing genome in directory specified on the command line", $dir{install}) if ($main::verbose >= 3);
      } elsif ($supported_organism{$organism_short_name}->{'data'}) {
	  $dir{install} = $supported_organism{$organism_short_name}->{'data'};
	  &RSAT::message::Info("Installing genome in directory previously specified in the config file", $dir{install}) if ($main::verbose >= 3);
      } else {
	  $dir{install} = $ENV{RSAT}."/public_html/data/genomes/".$organism_short_name;
	  &RSAT::message::Info("Installing genome in default directory", $dir{install}) if ($main::verbose >= 3);
      }
      
      $dir{genome} = $dir{install}."/genome";
      $dir{oligos} = $dir{install}."/oligo-frequencies";
      $outfile{features} = $dir{genome}."/feature.tab";
      $outfile{synonyms} = $dir{genome}."/feature_names.tab";
      $outfile{genome} = $dir{genome}."/contigs.txt";
      $outfile{fasta_genome} = $dir{genome}."/".$organism_short_name.".dna.genome.fa";
      $outfile{chrom_sizes} = $dir{genome}."/".$organism_short_name."_chrom_sizes.txt";
      $outfile{fasta_genome_rm} = $dir{genome}."/".$organism_short_name.".dna_rm.genome.fa";
      
      ## Create output directoryies if required
      unless ($dry) {
	  &RSAT::util::CheckOutDir($dir{install});
	  &RSAT::util::CheckOutDir($dir{genome});
	  &RSAT::util::CheckOutDir($dir{oligos});
      }
    }
    
    ################################################################
    ###################### Installation tasks ######################
    ################################################################

    ## Open an output stream for messages
    &OrgParameters() if ((scalar(@organisms) ==1) || ($main::verbose >= 3));
    
    &Erase() if ($task{erase});

    &Uninstall() if ($task{uninstall});
    
    unless (($task{uninstall}) || ($task{erase})) {
	
	if ($task{config}) {
	    $config_ok = &UpdateConfig();
	    next unless ($config_ok);
	}
      
      &CreatePhylogeny() if ($task{phylogeny});
      
      &StartAndStopCodons() if ($task{start_stop});
      
      &IntergenicSegments() if ($task{genome_segments});
      
      &AllUpstream() if ($task{allup});

      &FastaGenomeAndBedtoolsIndex() if ($task{fasta_genome} || $task{chrom_sizes} || $task{index_bedtools});

      ## Suppress empty lines in protein sequence files
      our $peptidic_seq_file = $dir{genome}."/".${organism_short_name}."_aa.fasta";
      &CleanProteinSequences($peptidic_seq_file) if ($task{protein_seq});
	
      ## Compute length distribution of protein sequence
      if ($task{protein_len}) {
	  my $command = &SeqLengthDistributionCmd($peptidic_seq_file, "fasta", "protein", 50, 2000);
	  &doit($command, $dry, $die_on_error, $main::verbose);
      }


      &CalcFrequencies() if (($task{oligos}) || ($task{distrib}) || ($task{dyads}));
      
      # New name for peptidic sequence file (Sept 2015). I suppress
      # organism name from sequence file name.
#      my $peptidic_seq_file = "peptidic_sequences.fasta";
#      unless (-e $dir{genome}."/".$peptidic_seq_file) {
	## Previous name for the peptidic sequence file
#	$peptidic_seq_file = ${organism_short_name}."_aa.fasta";
#      }



      &CleanUp() if ($task{clean});
      
      ## Touch the installation dir to indicate the last modification date
      $dir{main};
      chdir($dir{main});
      &RSAT::message::Info("Working dir",  $ENV{PWD}) if ($main::verbose >= 4);
      system "touch $dir{install}";
    }


  }

  &Finish();
}


################################################################
##################  SUBROUTINE DEFINITIONS  ####################
################################################################

sub Finish {
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
  print $main::out $exec_time if ($main::verbose >= 1);
  close $out if ($outputfile);
  exit(0);
}

################################################################
## Display verbosity message
sub Verbose {
  print $out "; install-organism ";
  &PrintArguments($out);

  print $out ";\n; Tasks:\n";
  foreach my $task (sort(keys(%task))) {
    print $out ";\t$task\n";
  }
  
  print $out ";\n; Supported organism table\n;\t", $main::org_table, "\n";

  
}


################################################################
## Print organism parameters
sub OrgParameters {
  print $out ";\n; Organism parameters:\n";
  if ($infile{organisms}) {
    printf $out ";    %-25s\t%s\n", "organism file", $infile{organisms};
  } else {
    printf $out ";    %-25s\t%s\n", "ID", $organism_short_name;
    printf $out ";    %-25s\t%s\n", "Name", $organism_full_name{$organism_short_name}; ## THIS IS WRONG (JvH, 2017-12-12)
    printf $out ";    %-25s\t%s\n", "Update date", $install_date;
    printf $out ";    %-25s\t%s\n", "Data source", $source;
    printf $out ";    %-25s\t%s\n", "strain", $strain;
    if ($strain_info{$strain}) {
	for my $field (qw(assembly_accession
                          taxid
			  species_taxid
                          organism_name
                          infraspecific_name
                          version_status
                          assembly_level 
                          seq_rel_date
                          asm_name
		          group
		          genome_size
  		          genome_size_ungapped
 		          contig_count
 		          total_gene_count
 		          protein_coding_gene_count
 		          non_coding_gene_count
		       )) {
	    printf $out ";    %-25s\t%s\n", $field, $strain_info{$strain}->{$field};
	}
    }
  }
  

  ## Directories
  print $out "; Directories\n";
  for my $d (sort(keys(%dir))) {
      printf $out ";    %-25s\t%s\n", $d, $dir{$d};
  }

  ## Input files
  print $out "; Input files\n";
  for my $f (sort(keys(%infile))) {
      printf $out ";    %-25s\t%s\n", $f, $infile{$f};
  }

  ## Output files
  print $out "; Output files\n";
  for my $f (sort(keys(%outfile))) {
      printf $out ";    %-25s\t%s\n", $f, $outfile{$f};
  }
}


################################################################
## Uninstall an organism by deleting its row from the configuration
## file. This does not clean the organism directory on the hard drive
## (which requires the task "erase").
sub Uninstall {
    foreach our $organism_short_name (@organisms) {
	&RSAT::OrganismManager::check_name($organism_short_name, 1);
	if (defined($supported_organism{$organism_short_name})) {
	    &RSAT::message::Warning("Uninstalling organism", $organism_short_name) if ($main::verbose >= 1);
	    delete($supported_organism{$organism_short_name});
	}
    }
    &RSAT::OrganismManager::export_supported_organisms(file=>$main::org_table, backup=>$main::backup_org_table);
}

################################################################
## Erase an organism by deleting its whole folder in
## RSAT/public_html/data/genomes directory.
sub Erase {
  foreach our $organism_short_name (@organisms) {
    &RSAT::message::Warning("Erasing organism", $organism_short_name) if ($main::verbose >= 1);
      
    ## Identify the data directory for the organism
    my $organism_data_dir = "";
    if ((defined($supported_organism{$organism_short_name})) &&
	(defined($main::supported_organism{$organism_short_name}->{'data'}))) {
      $organism_data_dir = $main::supported_organism{$organism_short_name}->{'data'};
    } else {
      $organism_data_dir = $ENV{RSAT}."/public_html/data/genomes/".$organism_short_name;
    }
    
    ## Remove the directory
    if (-e $organism_data_dir) {
      &RSAT::message::Warning("Erasing directory", $organism_data_dir) if ($main::verbose >= 2);
      &doit("rm -rf ".$organism_data_dir, $dry, $die_on_error, $main::verbose);
    } else {
      &RSAT::message::Warning("No need to erase unexisting directory", $organism_data_dir);
    }

  }
}

################################################################
## Update configuration table for the current organism
sub UpdateConfig {

  ## Read taxonomy from the current organism
  my $organism_table = $dir{install}."/genome/organism.tab";
  my $id = $null;
  $taxid = $null;
  $taxonomy = $null;

  unless (-e $organism_table) {
    if ($main::die_on_error) {
      &RSAT::error::FatalError("Missing organism table", $organism_table);
    } else {
      &RSAT::message::Warning("Missing organism table", $organism_table);
      return(0);
    }
  }

  ($org_handle) = &OpenInputFile($organism_table);
  my $taxid_field = 1; ## Default field for the taxonomic ID
  my $taxonomy_field = 2; ## Default field for the taxonomy
  while (<$org_handle>) {
    chomp();
    next unless (/\S/);
    if (/^-- field\s+(\d)	taxonomy/) {
      $taxonomy_field = $1;

    } elsif (/^-- field\s+(\d)	taxid/) { 
      $taxid_field = $1;

    } elsif (/^--/ || /^;/ || /^#/) { #; are required to parse organism.tab created by parse_gtf
      next;

    } else {
      @fields = split "\t";
      $id = $fields[0];

      ## Get taxonomy
      $taxonomy = $fields[$taxonomy_field - 1];

      ## Check if taxid is conform (must be a number)
      $taxid = $fields[$taxid_field - 1];
      if ($taxid =~ /^\d+$/) {
	    $supported_organism{$organism_short_name}->{'taxid'} = $taxid;
	    &RSAT::message::Info($organism_short_name, "TAXID", $taxid) if ($main::verbose >= 3);
      } else {
	    &RSAT::message::Warning($taxid, "Invalid taxid in organism description table",  $organism_table);
	    $taxid = $null;
      }

      &RSAT::message::Info ("Parsed taxonomy from organism.tab", 
			    "id", $id, 
			    "taxid", $taxid, 
			    "taxonomy", $taxonomy) if ($main::verbose >= 3);
    }
  }
  close $org_handle;

  ################################################################
  ## Define default limits of upstream region for retrieve-seq
  ## depending on the taxonomic group
  unless (defined($up_from)) {
    ## If already defined in the file supported-organism.tab, keep
    ## previously defined value (might have been specifically tuned
    ## for some reason).
    if (defined($supported_organism{$organism_short_name}->{'up_from'})) {
      $up_from = $supported_organism{$organism_short_name}->{'up_from'};;
    } else {
      ## Taxon-specific default values for sequence lengths
      if (
	  ($taxonomy =~ /^Bacteria;/) || ($taxonomy =~ /^Archaea;/) ||
	  ($taxonomy =~ /;\s*Bacteria;/) || ($taxonomy =~ /;\s*Archaea;/)
	 ) {
	$up_from=-400;
      } elsif (($taxonomy =~ /^Viruses;/) ||
	       ($taxonomy =~ /;\s*Viruses;/)
	      ) {
	$up_from=-400;
      } elsif ($taxonomy =~ /;\s*Fungi;/) {
	$up_from=-800;
      } elsif ($taxonomy =~ /;\s*Nematoda;/) {
	$up_from=-3000;
      } elsif ($taxonomy =~ /;\s*Insecta;/) {
	$up_from=-3000;
      } elsif ($taxonomy =~ /;\s*Metazoa/) {
	$up_from=-3000;
      } elsif ($taxonomy =~ /;\s*Viridiplantae/) {
	$up_from=-3000;
      } else {
	$up_from=-1000;
      }
    }
  }
  unless (defined($up_to)) {
    if (defined($supported_organism{$organism_short_name}->{'up_to'})) {
      $up_to = $supported_organism{$organism_short_name}->{'up_to'};
    } else {
      $up_to = -1;
    }
  }
  &RSAT::message::Info("Upstream region limits from", $up_from, "to", $up_to) if ($main::verbose >= 3);

  &UpdateConfigTab() unless (($dry) || ($batch));
  return(1);
}

################################################################
## Update the perl config file
sub UpdateConfigPerl {
  my $comment_previous_config = 0;
  if ($local_config) {
    $config_to_update = $ENV{'RSA_LOCAL_CONFIG'} ;
  } else {
    $config_to_update = $ENV{RSAT}."/public_html/data/supported_organisms.pl";
  }

  ## Check if the organism was already installed before
  if (($main::verbose >= 4) && defined($supported_organism{$organism_short_name}->{'genome'})) {
      &RSAT::message::Warning($organism_short_name, "already defined in the config file", $config_to_update) ;
  }
  
  ## read previous config
  open CONFIG, $config_to_update;
  while ($line = <CONFIG>) {
    chomp $line;
    last if ($line =~ /return/);
    if (($line =~ /supported_organism\{\'$organism_short_name\'\}/) && ($line !~ /^\#/)) {
      ## comment the previous config
      if ($comment_previous_config) {
	$previous_config .= "# ${line} # reinstalled on\t${date}\n";
      }
    } else {
	$previous_config .= $line."\n";
      }
  }
  close CONFIG;

  ## Write updated config file
  &RSAT::message::Info ("Updating supported organisms", $config_to_update) if ($main::verbose >= 3);
  open CONFIG, ">$config_to_update" 
    || die "Error: cannot write config file $config_to_update\n";
  print CONFIG $previous_config;
  my $new_org_config = 	 "\n#### $organism_short_name\t$organism_full_name{$organism_short_name}\t$install_date\n";
  $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'name'} = \"$organism_full_name{$organism_short_name}\";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'data'} = \"$dir{install}\";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'last_update'} = \"".$install_date."\";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'source'} = \"$source\";\n";
  ## OLIVIER SAND SHOULD CHECK IF THIS RESTRICTION FOR ensembl IS STILL VALID
  unless ($source =~ 'ensembl') {
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'features'} = \"$outfile{features}\";\n";
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'genome'} = \"$outfile{genome}\";\n";
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'seq_format'} = \"filelist\";\n";
  }
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'taxonomy'} = \"$taxonomy\";\n";
  if (defined($outfile{synonyms})) {
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'synonyms'} = \"$outfile{synonyms}\";\n";
  }
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'up_from'} = ".$up_from.";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'up_to'} = ".$up_to.";\n";

  ## Replace absolute paths by relative paths
  ##   $new_org_config =~ s|$ENV{RSAT}\/|\$ENV\{RSAT\}\/|g;
  $new_org_config =~ s|$ENV{RSAT}|\$ENV\{RSAT\}\/|g;
  $new_org_config =~ s|\/\/|/|g;

  $download_date = &AlphaDate();
  $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'download_date'} = ".$download_date.";\n";

  
  print CONFIG $new_org_config;
  print CONFIG "\nreturn 1;\n";
  close CONFIG;
}

################################################################
## Update the tab-delimited file describing the list of supported
## organisms
sub UpdateConfigTab {
  my (%args) = @_;
  &RSAT::message::Info ("Updating supported organisms", $main::org_table) if ($main::verbose >= 3);

  ## Update the hash variable for the new organism
  $supported_organism{$organism_short_name}->{'name'} = $args{name} || $organism->get_attribute("name");
  $supported_organism{$organism_short_name}->{'data'} = $args{data} || $dir{install};
  $supported_organism{$organism_short_name}->{'last_update'} = $args{last_update} ||  $install_date;
  $supported_organism{$organism_short_name}->{'genome_assembly'} = $organism->get_attribute("genome_assembly");
  $supported_organism{$organism_short_name}->{'download_date'} =  &AlphaDate() if ($task{download});
  $supported_organism{$organism_short_name}->{'update_date'} =  &AlphaDate();

#  &RSAT::message::Debug($organism,  $organism_short_name, $organism->get_attribute("name"), $install_date) if ($main::verbose >= 10);

  ## Change source if requested
  if (($main::source) && ($main::source ne $null)) {
    $supported_organism{$organism_short_name}->{'source'} = $main::source;
  } elsif (($args{source}) && ($args{source} ne $null)) {
    $supported_organism{$organism_short_name}->{'source'} = $args{source};
  }
 
  $supported_organism{$organism_short_name}->{'features'} = $args{features} || $main::outfile{features};
  $supported_organism{$organism_short_name}->{'genome'} = $args{genome} || $main::outfile{genome};
  $supported_organism{$organism_short_name}->{'seq_format'} = $args{seq_format} || "filelist";
  $supported_organism{$organism_short_name}->{'taxonomy'} = $args{taxonomy} || $main::taxonomy;
  $supported_organism{$organism_short_name}->{'taxid'} = $args{taxid} || $main::taxid;
  if (defined($main::outfile{synonyms})) {
    $supported_organism{$organism_short_name}->{'synonyms'} = $args{synonyms} || $main::outfile{synonyms};
  }

  $supported_organism{$organism_short_name}->{'up_to'} = $args{up_to} || $main::up_to;
  if (($main::up_from) && ($main::up_from ne $null)) {
    $supported_organism{$organism_short_name}->{'up_from'} = $main::up_from;
  } elsif (($args{up_from}) && ($args{up_from} ne $null)) {
    $supported_organism{$organism_short_name}->{'up_from'} = $args{up_from};
  }
 
  ## Update blast column in config table if blast information is available
  my $blast_folder = $supported_organism{$organism_short_name}->{'data'}."/blast_hits" ;
  &RSAT::message::Info($blast_folder) if ($main::verbose >= 2);
  if ( -d $blast_folder ){
    $supported_organism{$organism_short_name}->{'blast_available'}= 1;
  } else {
    $supported_organism{$organism_short_name}->{'blast_available'}= 0;
  }

  ## Update variant columns in config table if variant information
  my $variants_folder=$supported_organism{$organism_short_name}->{'data'}."/variations" ;
  &RSAT::message::Info($variants_folder) if ($main::verbose >= 2);
  if ( -d $variants_folder ){
    $supported_organism{$organism_short_name}->{'variant_available'}= 1;
    my $variant_source = `cat $variants_folder/source.txt`;
    chomp($variant_source);
    $supported_organism{$organism_short_name}->{'variant_source'}= $variants_source || "<NA>";
    $supported_organism{$organism_short_name}->{'path_to_variant_files'}= $variants_folder;
  } else {
    $supported_organism{$organism_short_name}->{'variant_available'}= 0; 
    $supported_organism{$organism_short_name}->{'variant_source'}= "<NA>";
    $supported_organism{$organism_short_name}->{'path_to_variant_files'}= "<NA>";
  }

  ## Update organism configuration table
  &RSAT::message::Debug("&UpdateConfigTab()",
			$supported_organism{$organism_short_name}->{'source'},
			$supported_organism{$organism_short_name}->{'taxid'},
			$supported_organism{$organism_short_name}->{'ID'},
			$supported_organism{$organism_short_name}->{'last_update'},
			$supported_organism{$organism_short_name}->{'up_from'},
			$supported_organism{$organism_short_name}->{'name'},
			$supported_organism{$organism_short_name}->{'variant_available'},
			$supported_organism{$organism_short_name}->{'variant_source'},
			$supported_organism{$organism_short_name}->{'path_to_variant_files'}
      ) if ($main::verbose >= 3);


  ## Export the updated table of supported organisms
  &RSAT::OrganismManager::export_supported_organisms(file=>$main::org_table, backup=>$main::backup_org_table);
}

################################################################
## Rewrite the table of supported organisms on RSAT
sub RewriteConfigTab {
    my $supported_organism_file = &Get_supported_file();
    my $new_file = '';
    if (-f $supported_organism_file) {
        my ($s_o_file) = &OpenInputFile($supported_organism_file);
        
        ## Read supported_organisms_ensembl.tab
        my $data_dir = &Get_data_dir();
        open ($fen, $data_dir."/supported_organisms_ensembl.tab");
        my %ensembl = ();
        while (my $row = <$fen>){
            chomp($row);
            my @fields = split("\t", $row);
            $ensembl{$fields[0]} = "\t".$fields[2]."\t".$fields[4]."\t".$fields[5];
            #print $fields[0];
        }
        close($fen);
        
        my %blast = ();
        my %variations = ();
        
        while(<$s_o_file>){
            my @fields = split("\t");
            my $key = $fields[0];
            if(-d $data_dir."/genomes/".$key."/blast_hits"){
                $blast{$key} = 1;
            }else{
                $blast{$key} = 0;
            }
            $variations{$key} = ();
            if(-d $data_dir."/genomes/".$key."/variations"){
                $variations{$key}{"variant_available"} = 1;
                my $variant_source = `cat $variants_folder/source.txt`;
                chomp($variant_source);
                $variations{$key}{"source"} = $variant_source || "<NA>";
                $variations{$key}{"path"} = &RSAT::util::hide_RSAT_path($data_dir."/genomes/".$key."/variations");
            }else{
                $variations{$key}{"variant_available"} = 0;
                $variations{$key}{"source"} = "<NA>";
                $variations{$key}{"path"} = "<NA>";
            }
        
        }
        close($s_o_file);
        my ($s_o_file) = &OpenInputFile($supported_organism_file);
        ## Read the whole file of supported organisms from ensembl,
        ## and store species description lines in a hash indexed by
        ## full species ID, in order to sort them after having changed
        ## the current species fields.
        open($fnew, ">", $data_dir."/supported_organisms_new.tab");
        while (my $row = <$s_o_file>) {
            next if (/^;/); ## Skip comment lines
            if($row =~ /^#/){
                my @fields = split("\t");
                print $fnew "#ID\tname\ttaxid\tsource\tlast_update\tnb\tseq_format\tup_from\tup_to\ttaxonomy\tdata\tgenome\tgenome_assembly\tgenome_version\tdownload_date\tvariant_available\tvariant_source\tpath_to_variant_files\tblast_available\n";
            }else{
                chomp($row);
                my @fields = split("\t", $row);
                my $full_species_id = $fields[0];
                #check if the species is already in the list
                my $newline = $row;
                if(defined $ensembl{$full_species_id}){
                    $newline .= $ensembl{$full_species_id};
                }else{
                    $newline .= ("\t<NA>")x3;
                }
                $newline .= "\t".$variations{$full_species_id}{"variant_available"}."\t".$variations{$full_species_id}{"source"}."\t".$variations{$full_species_id}{"path"}."\t".$blast{$full_species_id}."\n";
                print $fnew $newline;
            }
        }
        close $s_o_file;
    	system("mv " . $data_dir . "/supported_organisms_new.tab " . $data_dir . "/supported_organisms.tab");
    }
}

################################################################
## Create a directory for the taxonomic group of the organism
sub CreatePhylogeny {
    my $taxonomy = "";
    &RSAT::message::Warning("Phylogeny indexing has been disabled");
    return;

    ## read taxonomy from the parsing result
    my $organism_table = $dir{install}."/genome/organism.tab";

    ($org_handle) = &OpenInputFile($organism_table);
    while (<$org_handle>) {
        chomp;
        if (/^-- field (\d)	taxonomy/) {
            $taxonomy_field = $1;
        } elsif (/^--/) {
            next;
        } else {
            if (defined($taxonomy_field)) {
                @fields = split "\t";
                $taxonomy = $fields[$taxonomy_field - 1];
                &RSAT::message::Info ("Taxonomyyy\t$taxonomy\n") if ($main::verbose >= 3);
            } else {
                &Warning("Cannot read taxonomy in file $organism_table\n");
            }
        }
    }
    close $org_handle;

    if ($taxonomy) {
	$taxonomy = &trim($taxonomy);
	$taxonomy =~ s|\s*;\s*|/|g; ## Each taxonomic level becomes a subdirectorry
	$taxonomy =~ s|\s+|_|g; ## I prefer to avoid spaces in directory names
	$taxonomy =~ s|\(|_|g; ## this character cannot be used for a directory name
	$taxonomy =~ s|\)|_|g; ## this character cannot be used for a directory name
	$taxonomy =~ s|\,|.|g; ## Not fatal, but usually not found in folder names.
	$taxonomy =~ s|\:|.|g; ## Not fatal, but usually not found in folder names.
	$dir{taxonomy} = $ENV{RSAT}."/public_html/data/phylogeny/".$taxonomy;
	$dir{taxonomy} =~ s|//|/|g;
	my ($org_dir) = &ShortFileName($dir{install});
	&RSAT::util::CheckOutDir($dir{taxonomy}) unless ($dry);
	if ($main::verbose >= 3) {
	    &RSAT::message::Info("Taxonomy directory", $dir{taxonomy});
	    &RSAT::message::Info("Organism directory", $org_dir);
	    &RSAT::message::Info("Link to directory", $dir{install});
	}
	&doit("cd $dir{taxonomy}; rm $org_dir; ln -s $dir{install} .",0,0,$main::verbose);
    } else {
	&RSAT::message::Warning("Cannot identify taxonomy in table ".$organism_table);
    }

}

################################################################
### extract the non-redudant set of intergenic and gene sequences
sub IntergenicSegments {
  &RSAT::message::TimeWarn("&IntergenicSegments()") if ($main::verbose >= 3);

  chdir $dir{genome};

  ## Retrieve sequences for different subtypes of genome regions
  &RSAT::message::TimeWarn("coding-or-not: extracting sequences for different types of genomic regions") if ($main::verbose >= 2);
  my $command = "coding-or-not ";
  $command .= "-v 1 " if ($main::verbose >= 1);
  $command .= "-org $organism_short_name -return ncs,cs,pos,seq,stats";
#  &doit($command, $dry, $die_on_error, $main::verbose);

  my @types = ();
  push @types, "gene";
  push @types, "intergenic";

  foreach my $seq_type (@types) {
    my $seq_file = $dir{genome}."/".$organism_short_name."_".$seq_type."_segments.fasta";

    ## Draw sequence length distributions
    &RSAT::message::TimeWarn("Computing sequence length for genomic regions", $seq_type) if ($main::verbose >= 2);
    if ($task{seq_len_distrib}) {
      $command .= "; " unless ($command eq "");
      $command .= &SeqLengthDistributionCmd($seq_file, "fasta", $seq_type, 50, 2000);
    }

    ## Compress sequence file
    if (-e $seq_file) {
      &RSAT::message::TimeWarn("Compressing sequences for genomic regions", $seq_type) if ($main::verbose >= 2);
      $command .= "; " unless ($command eq "");
      $command .= "gzip -f $seq_file";

      ## Purge sequences
      if ($purged_frequencies) {
	&RSAT::message::TimeWarn("Purging sequences for genomic regions", $seq_type) if ($main::verbose >= 2);
	$command .= "; " unless ($command eq "");
	$command .= &PurgeSeqCommand($seq_file);
      }
#      &doit($command, $dry, $die_on_error, $main::verbose);
      &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
    }

  }

  chdir $dir{main};
}


################################################################
### extract the complete set of upstream sequences
sub AllUpstream {
  &RSAT::message::TimeWarn("Retrieving all upstream sequences") if ($main::verbose >= 2);

  foreach my $masking (@masking_modes) {
    foreach my $noorf ("", "-noorf") {
      my $seq_type = "upstream${noorf}${masking}";
      my $seq_file = $dir{genome}."/".${organism_short_name}."_".${seq_type};
      my @seq_formats = qw(fasta ft);
      #      my @seq_formats = qw(ft);
      my $command = "";

      ## Retrieve sequences in two formats: fasta is used for
      ## computing oligo and dyad frequencies, whereas ft is convenient for
      ## retrieve-seq-multigenome
      foreach $format (@seq_formats) {
	$command .= "; " unless ($command eq ""); 
	$command .= "retrieve-seq -all";
	$command .= " -type upstream -feattype gene -org ".$organism_short_name;
	$command .= " ".${noorf};
	$command .= " ".${masking};
	$command .= " -label ID";
	$command .= " -format ".$format;
	$command .= " -o ".$seq_file.".".$format;
	&RSAT::message::Info("Exported sequence file", $seq_file.".".$format) if ($main::verbose >= 3);
      }

      ## Compute sequence length distributions and generate a frequency plot
      $command .= "; ".&SeqLengthDistributionCmd($seq_file.".fasta", "fasta", $seq_type, 50, 2000) if ($noorf);

      ## Purge sequences
      if ($purged_frequencies) {
	$command .= "; ";
	$command .= &PurgeSeqCommand($seq_file.".fasta", "fasta");
      }

      ## Compress sequence file
      $command .= "; gzip -f ".$seq_file.".fasta";

      &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  

    }
  }
  chdir $dir{main};
}


################################################################
## Convert genome sequence to fasta, required for retrieve-seq-bed and
## bedtools, and generate the fasta index file (.fai).
sub FastaGenomeAndBedtoolsIndex {
  &RSAT::message::TimeWarn("Generating fasta sequences and bedtools fasta index. ") if ($main::verbose >= 2);

  my $command = "";

  ## Generate fasta sequence
  if ($task{fasta_genome}) {
    $command .= "convert-seq";
    $command .= " -i ".$outfile{genome};
    $command .= " -from filelist -to fasta";
    $command .= " -o ".$outfile{fasta_genome};
#    &doit($command, $dry, $die_on_error, $main::verbose);
    &RSAT::message::Info("\tFasta genome", $outfile{fasta_genome}) if ($main::verbose >= 3);

    ## JvH (2017-11-02): I don't remember why I made a soft link to
    ## dna.genome.fa instead of generating the repeat-masked genome.
#    $command .= "; cd ".$dir{genome}. "; ln -f -s ".$organism_short_name.".dna.genome.fa ".$organism_short_name.".dna_rm.genome.fa; cd -";

#    &doit($command, $dry, $die_on_error, $main::verbose);
  }

  ## Convert soft-masked genome into hard-masked genome
  if ($task{fasta_genome_rm}) {
    $command .= "; convert-seq";
    $command .= " -i ".$outfile{fasta_genome};
    $command .= " -from fasta -to fasta -mask lower";
    $command .= " -o ".$outfile{fasta_genome_rm};
  }

  ## Compute chromosome sizes
  if ($task{chrom_sizes}) {
    $command .= "; " if ($command);
    $command .= "sequence-lengths -i ".$outfile{fasta_genome};
    $command .= " -o ".$outfile{chrom_sizes};
    &RSAT::message::Info("\tChromsome sizes", $outfile{chrom_sizes}) if ($main::verbose >= 3);
  }

  ################################################################
  ## Generate bedtool indexes. This must be done as RSAT user in order
  ## to enable retrieve-seq-bed, which relies on bedtools getfasta.
  ## 
  ## We use a trick to run bedtools getfasta with a single-nucleotide bed
  ## file, whihc creates the fasta index if it does not exist. .
  if ($task{index_bedtools}) {
    my $rand_fragment_coord = &RSAT::util::make_temp_file("","random-genome-fragments_coord", 1);
    my $rand_fragment_seq = &RSAT::util::make_temp_file("","random-genome-fragments_seq", 1);
    $command .= "; " if ($command);
    $command .= &RSAT::server::GetProgramPath("random-genome-fragments");
    $command .= " -n 1 -l 1";
    $command .= " -org ".$organism_short_name;
    $command .= " -o ".$rand_fragment_coord;
    $command .= " ; ";
    $command .= &RSAT::server::GetProgramPath("retrieve-seq-bed");
    $command .= " -i ".$rand_fragment_coord;
    $command .= " -org ".$organism_short_name;
    $command .= " -o ".$rand_fragment_seq;
    $command .= " ; ";
    $command .= &RSAT::server::GetProgramPath("retrieve-seq-bed");
    $command .= " -rm";
    $command .= " -i ".$rand_fragment_coord;
    $command .= " -org ".$organism_short_name;
    $command .= " -o ".$rand_fragment_seq;
    $command .= " ; rm -f ".$rand_fragment_seq; 
    $command .= " ; rm -f ".$rand_fragment_coord; 
    &RSAT::message::Info("\tFasta index", $outfile{fasta_genome}.".fai") if ($main::verbose >= 3);
    #&doit($command, $dry, $die_on_error, $main::verbose);
  }

  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
}


################################################################
## Draw an histogram of sequence lengths
sub SeqLengthDistributionCmd {
    my ($seq_file, $format, $seq_type, $ci, $xmax) = @_;
    &RSAT::message::TimeWarn("&SeqLengthDistributionCmd()", $seq_file, $format, $seq_type, $ci, $xmax) if ($main::verbose >= 2);
    $ci = 50 unless $ci;
    $format = "fasta" unless $format;
    my $classfreq_from = 0;
    chdir $dir{genome};

    if ($main::verbose >= 4) {
	my $pwd = $ENV{PWD};
#2D	my $pwd = `pwd`;
#	chomp $pwd;
	&RSAT::message::TimeWarn("&SeqLengthDistributionCmd()", $seq_file, $seq_type, "Working dir", $pwd);
    }

    my $len_prefix = $dir{genome}."/".$organism_short_name."_".$seq_type."_lengths";
    my $command = "sequence-lengths -i $seq_file -in_format ".$format;
    $command .= " -o ${len_prefix}.tab";
    $command .= "; classfreq -v 1 -i ${len_prefix}.tab ";
    $command .= " -col 2 -ci ".$ci." -from ".$classfreq_from;
    $command .= " -o ".$len_prefix."_distrib.tab";
    $command .= "; XYgraph -i ${len_prefix}_distrib.tab";
    $command .= " -xcol 3 -ycol 4,5,6 -lines -xmin $classfreq_from";
    $command .= " -xsize 800 -ysize 400 -legend";
    $command .= " -xgstep1 200 -xgstep2 50 -xmin 0";
    $command .= " -xmax ".$xmax;
    $command .= " -ymin 0 -ygstep2 1000";
    $command .= " -xlog 2" if( ($seq_type eq "intergenic") || ($seq_type eq "gene"));
    $command .= " -xleg1 'Sequence length (bp)'";
    $command .= " -yleg1 'Frequency'";
    $command .= " -title1 '".$organism_full_name{$organism_short_name}."'";
    $command .= " -title2 'length distribution of ${seq_type} sequences'";
    $command .= " -format ".$img_format;
    $command .= " -o ".$len_prefix."_distrib.".$img_format;
#    &doit($command, $dry, $die_on_error, $main::verbose);
    return($command);
}

################################################################
## purge sequences
sub PurgeSeqCommand {
  my ($seq_file, $format) = @_;
  &RSAT::message::Warning("Purging sequence", $seq_file, $format) if ($main::verbose >= 2);
  $format = "fasta" unless ($format);
  my $purged_seq_file = `basename $seq_file $format`;
  $purged_seq_file .= "_purged.$format";
  my $command = "purge-sequence -i $seq_file -format $format -ml 300 -mis 9 -2str -o $purged_seq_file";
#  &doit($command, $dry, $die_on_error, $main::verbose);
  ## compress purged sequence file
  $command .= "; gzip -f $purged_seq_file";
#  &doit($command, $dry, $die_on_error, $main::verbose);
  return($command);
}

################################################################
### calculate oligo and dyad frequencies in different sequence types :
### - intergenic
### - upstream
### - genomic
### 
sub CalcFrequencies { 
  #    my ($seq_file, $seq_format, $seq_type) = ();
  &RSAT::message::TimeWarn("&CalcFrequencies()", $seq_file, $seq_format, $seq_type) if ($main::verbose >= 2);
  chdir $dir{oligos};

  ################################################################
  ## Calculate oligonucleotide and dyad frequencies in all upstream
  ## sequences
  if ($task{upstream_freq}) {
    foreach my $masking (@masking_modes) {
      foreach my $noorf ("-noorf", "") {
	$seq_type = "upstream${noorf}${masking}";
	$seq_format = "fasta";
	$seq_file = $dir{genome}."/".$organism_short_name."_".$seq_type;
	if ($purged_frequencies) {
	    $seq_file .= "_purged";
	}
	$seq_file .= ".fasta";
	
	unless (-e $seq_file) {
	    if (-e $seq_file.".gz") {
		$seq_file = $seq_file.".gz";
	    } else {
		&RSAT::error::FatalError("Cannot compute upstream frequencies because upstream sequence file is missing:\t", $seq_file);
	    }
	}

	&RSAT::message::TimeWarn("Calculating upstream oligo and dyad frequencies", $noorf, $masking, $seq_file) if ($main::verbose >= 2);
	&CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos});
	&CalcOligoDistrib($seq_file, $seq_format, $seq_type) if ($task{distrib} && !($noorf));
	&CalcDyadFreq($seq_file, $seq_format, $seq_type) if ($task{dyads});
      }
    }
  }

  ################################################################
  ## Specific tasks for genomes downloaded from Ensembl
  if ($task{ensembl_freq}) {

    unless (defined($up_from)) {
      if (defined($supported_organism{$organism_short_name}->{'up_from'})) {
	$up_from = $supported_organism{$organism_short_name}->{'up_from'};;
      }
    }

    unless (defined($up_to)) {
      if (defined($supported_organism{$organism_short_name}->{'up_to'})) {
	$up_to = $supported_organism{$organism_short_name}->{'up_to'};
      }
    }

    foreach my $masking (@masking_modes) {
      #	foreach my $masking ("-rm") {
      #	    foreach my $noorf ("-noorf", "") {
      foreach my $maskcoding ("-maskcoding") {
	#		    foreach my $type ("upstream_mrna") {
	#		    foreach my $type ("intron") {
	#		    foreach my $type ("firstintron") {
	foreach my $type ("utr") {
	  #			$seq_type = "${type}${up_from}${up_to}${maskcoding}${masking}";
	  $seq_type = "${type}${maskcoding}${masking}";
	  #		    if ($purged_frequencies) {
	  #			$seq_file = "${organism_short_name}_${seq_type}_purged.fasta";
	  #			$seq_format = "fasta";
	  #		    } else {
	  $seq_file = "${organism_short_name}_${seq_type}.fasta";
	  $seq_format = "fasta";
	  #		    }
	  &RSAT::message::TimeWarn("Calculating upstream oligo and dyad frequencies") if ($main::verbose >= 1);
	  &CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{oligos});
	  #		    &CalcOligoDistrib($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{distrib} && !($noorf));
	  #		    &CalcDyadFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{dyads});
	}
	#	    }
      }
    }
  }


  ################################################################
  ## Calculate oligont frequencies in all intergenic sequences
  if ($task{intergenic_freq}) {
    $seq_type = "intergenic";
    &RSAT::message::TimeWarn("Calculating ${seq_type} oligo and dyad frequencies") if ($main::verbose >= 1);
    if ($purged_frequencies) {
      $seq_file = "${organism_short_name}_intergenic_segments_purged.fasta";
      $seq_format = "fasta";
    } else {
      $seq_file = "${organism_short_name}_intergenic_segments.fasta";
      $seq_format = "fasta";
    }
    &CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{oligos});
    #	&CalcOligoDistrib($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{distrib});
    &CalcDyadFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{dyads});
  }
    
  ################################################################
  ## Calculate oligo-frequencies in full genome
  if ($task{genome_freq}) {
    $seq_type = "genomic";

    ## First check if a fasta file exists for the current genome
    ## (since 2015-11, fasta genomes are downloaded from
    ## ensemblgenomes). If yes, use it in priority because it allows
    ## to use the optiojn -quick for oligo-analysis and dyad-analysis.
    my $genome_dir = $ENV{RSAT}."/public_html/data/genomes/".$organism_short_name."/genome";
    my $fasta_genome = $genome_dir."/".$organism_short_name.".dna.genome.fa";
    if (-e $fasta_genome) {
	$seq_file = $fasta_genome;
	$seq_format = "fasta";
	&CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos});
	#	&CalcOligoDistrib($seq_file, $seq_format, $seq_type) if ($task{distrib});
	&CalcDyadFreq($seq_file, $seq_format, $seq_type) if ($task{dyads});
    } else {
	&RSAT::message::Warning("Missing fasta genome file", $fasta_genome);
	&RSAT::message::Warning("Fasta genome file not found -> skipping whole genome oligo/dyad frequencies.");
	$seq_format = "filelist";
	$seq_file = $outfile{'genome'};
    }
    #&CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos});
    #	&CalcOligoDistrib($seq_file, $seq_format, $seq_type) if ($task{distrib});
    #&CalcDyadFreq($seq_file, $seq_format, $seq_type) if ($task{dyads});
    
  }

  ################################################################
  ## calculate oligont frequencies in all gene sequences 
  ##
  ## GENE FREQUENCIES ARE NOT WORKING ANYMORE, AND, BESIDES, THEY WERE NOT
  ## USEFUL
  if ($task{gene_freq}) {
    $seq_type = "gene";
    if ($purged_frequencies) {
      $seq_file = "${organism_short_name}_gene_segments_purged.fasta";
      $seq_format = "fasta";
    } else {
      $seq_file = "${organism_short_name}_gene_segments.fasta";
      $seq_format = "fasta";
    }
    &CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{oligos});
    #	&CalcOligoDistrib($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{distrib});
    &CalcDyadFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{dyads});
  }


  
  ################################################################
  # Calculate oligopeptide frequencies in all protein sequences
  if ($task{protein_freq}) {
      $seq_file = $peptidic_seq_file;
      if (-e $peptidic_seq_file) {
	  $seq_type = "protein";
	  $seq_format = "fasta";
	  &RSAT::message::TimeWarn("Calculating oligopeptide frequencies") if ($main::verbose >= 2);
	  &CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos});
      } else {
	  &RSAT::message::Warning("Skipping oligopeptide frequencies because peptidic sequence file is missing", $seq_file);
      }
  }
  
  chdir $dir{main};
}

################################################################
## Calculate oligonucleotide frequencies for a specified sequence
## file
sub CalcOligoFreq {
  my ($seq_file, $seq_format, $seq_type) = @_;
  &RSAT::message::TimeWarn("&CalcOligoFreq()", $seq_file, $seq_format, $seq_type) if ($main::verbose >= 2);
  unless ($seq_file) {
      &RSAT::error::FatalError("&CalcOligoFreq() called with empty sequence file name");
  }
  unless (-e $seq_file) {
      &RSAT::error::FatalError("&CalcOligoFreq(): sequence file does not exist\t", $seq_file);
  }
  
  my @current_oligo_lengths = @oligo_lengths;
#  my @current_oligo_lengths = (6);
  my $oligo_seq_type = "dna";
  my $residue_type = "nt";
  my $quick = "";
  my @batch_commands = ();

  my @strands = ("-1str", "-2str");
  if ($seq_type eq "protein") {
    @current_oligo_lengths = 1..3;
    $oligo_seq_type = "prot";
    $residue_type = "pept";
    @strands = ("");
  }

  ## Quick option only works with DNA sequences in fasta format
  unless (($seq_type eq "protein") || ($seq_format ne "fasta")) {
    $quick = "-quick";
  }
  
  ## Uncompress sequence file if required because the option -quick
  ## currently does not support compressed files
  my $uncompressed = 0;
  if (!(-e $seq_file) && (-e $seq_file.".gz")) {
      &RSAT::message::Warning("Uncompressing sequence file", $seq_file) if ($main::verbose >= 2);
      my $command = "gunzip ".$seq_file.".gz";
      my $job_prefix = ${organism_short_name}."_compress";
      $uncompressed = 1;
      &doit($command, $dry, $die_on_error, $main::verbose); #, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
      $seq_file =~ s/\.gz$//;
  }

  my $job_prefix = ${organism_short_name}."_oligos";
  foreach my $noov ("-noov", "-ovlp") {
    foreach my $strands (@strands) {
      foreach my $oligo_length (@current_oligo_lengths) {
	&RSAT::message::TimeWarn("Calculating oligo frequencies",$seq_file, $seq_format, $seq_type, "l=".$oligo_length, $noov, $strands) if ($main::verbose >= 2);

	my $out_file = $dir{oligos}."/".${oligo_length}.${residue_type}."_".${seq_type}."_".${organism_short_name}.${noov}.${strands}.".freq";
	my $command = &RSAT::server::GetProgramPath("oligo-analysis");
	$command .= " ".$quick;
	$command .= " -v 1 ";
	$command .= " ".$strands;
	$command .= " -i ".$seq_file;
	$command .= " -format ".$seq_format;
	$command .= " -seqtype ".$oligo_seq_type;
	$command .= " ".$noov;
	$command .= " -l ".$oligo_length." -type dna ";
	$command .= " -return freq,occ";
	$command .= " -o ".$out_file;
	$command .= "; gzip -f ".$out_file;
	if ($batch) {
	    push @batch_commands, $command;
	} else {
	    &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
	}
      }
    }
  }

  ## Run all the oligo frequencies in one batch job
  if ($batch) {
      my $batch_command = join(" ;", @batch_commands);
      &doit($batch_command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
  }

  ## Recompress the sequence file if required
  if ($uncompressed) {
      my $command = "gzip ".$seq_file;
      my $job_prefix = ${organism_short_name}."_uncompress";
      if ($batch) {
	  &RSAT::message::Warning("Cannot recompress sequence in batch mode", $seq_file) if ($main::verbose >= 2);
      } else {
	  &RSAT::message::Info("Compressing sequence file", $seq_file) if ($main::verbose >= 2);
	  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
      }
  }
}

################################################################
## Calculate oligonucleotide distribution for a specified sequence
## file
sub CalcOligoDistrib {
  my ($seq_file,$seq_format,$seq_type) = @_;
  &RSAT::message::TimeWarn("&CalcOligoDistrib()", $seq_file,$seq_format,$seq_type) if ($main::verbose >= 2);
  @strands = ("-1str", "-2str");
  foreach my $noov ("-noov", "-ovlp") {
    foreach my $strands (@strands) {
      foreach my $oligo_length (@oligo_lengths) {
	my $job_prefix = ${organism_short_name}."_oligo_".$oligo_length;

	#### Calculate occurrence distributions in the sequence file
	my $distrib_file = $dir{oligos}."/"."${oligo_length}nt_${seq_type}_${organism_short_name}${noov}${strands}_distrib.tab";
	my $command = &RSAT::server::GetProgramPath("oligo-analysis");
	$command .= " -v 1";
	$command .= " -i ".$seq_file;
	$command .= " -format ".$seq_format;
	$command .= " ".$strands;
	$command .= " ".$noov;
	$command .= " -l ".$oligo_length." -type dna";
	$command .= " -return occ -distrib";
	$command .= " -o ".$distrib_file;
	$command .= " ; gzip -f ".$distrib_file;
	&doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  

	#### Fit a Poisson and a negbin on occurrence distribution
	foreach my $theor ("negbin", "poisson") {
	  my $fitting_file = "$dir{oligos}/${oligo_length}nt_${seq_type}_${organism_short_name}${noov}${strands}_${theor}.tab";
	  $command = "fit-distribution -v 1 -i ".$distrib_file;
	  $command .= " -o ".$fitting_file;
	  $command .= " -distrib ".$theor;
	  $command .= " ; gzip -f ".$fitting_file;
	  &doit($command, $dry, $die_on_error, $main::verbose);  
	}
      }
    }
  }
}


################################################################
## calculate dyad frequencies for a specified sequence
## file
sub CalcDyadFreq {
  my ($seq_file, $seq_format, $seq_type) = @_;
  &RSAT::message::TimeWarn("&CalcDyadFreq()", $seq_file, $seq_format, $seq_type) if ($main::verbose >= 2);
  $min_spacing = 0;
  $max_spacing = 20;
  @monad_lengths = (3,2,1);
  @strands = ("-1str", "-2str");
  my $quick = "";
  my @batch_commands = ();

  ## Quick option only works with DNA sequences in fasta format
  unless (($seq_type eq "protein") || ($seq_format ne "fasta")) {
    $quick = "-quick";
  }

  ## Uncompress sequence file if required because the option -quick
  ## currently does not support compressed files
  my $uncompressed = 0;
  if (!(-e $seq_file) && (-e $seq_file.".gz")) {
      &RSAT::message::Warning("Uncompressing sequence file", $seq_file) if ($main::verbose >= 2);
      my $command = "gunzip ".$seq_file.".gz";
      my $job_prefix = ${organism_short_name}."_uncompress";
      $uncompressed = 1;
      &doit($command, $dry, $die_on_error, $main::verbose); #, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
      $seq_file =~ s/\.gz$//;
  }

  ## Compute dyad frequencies for each sequence type, strand grouping (on or off) and monad length
  my $job_prefix = ${organism_short_name}."_dyads";
  foreach my $noov ("-noov", "-ovlp") {
    foreach my $strands (@strands) {
      foreach my $monad_length (@monad_lengths) {
	&RSAT::message::TimeWarn("Calculating dyad frequencies",$seq_file, $seq_format, $seq_type, "l=".$monad_length, $noov, $strands) if ($main::verbose >= 2);
	$dyad_file = $dir{oligos}."/"."dyads_${monad_length}nt_sp${min_spacing}-${max_spacing}_${seq_type}_${organism_short_name}${noov}${strands}";
	$dyad_file .= ".freq";
	my $command = &RSAT::server::GetProgramPath("dyad-analysis");
	$command .= " ".$quick;
	$command .= " -v 1";
	$command .= " -i ".$seq_file;
	$command .= " -format ".$seq_format;
	$command .= " -timeout 240000 ";
	$command .= " -type any -seqtype dna";
	$command .= " ".$strands;
	$command .= " ".$noov;
	$command .= " -sp ".$min_spacing."-".$max_spacing;
	$command .= " -l $monad_length";
	$command .= " -return freq,occ";
	$command .= " -o ".$dyad_file;
	$command .= "; gzip -f ".$dyad_file;
	if ($batch) {
	    push @batch_commands, $command;
	} else {
	    &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);
	}
      }
    }
  }

  ## Run all the oligo frequencies in one batch job
  if ($batch) {
      my $batch_command = join(" ;", @batch_commands);
      &doit($batch_command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
  }

  ## Recompress the sequence file if required
  if ($uncompressed) {
      my $command = "gzip ".$seq_file;
      my $job_prefix = ${organism_short_name}."_uncompress";
      if ($batch) {
	  &RSAT::message::Warning("Cannot recompress sequence in batch mode", $seq_file) if ($main::verbose >= 2);
      } else {
	  &RSAT::message::Info("Compressing sequence file", $seq_file) if ($main::verbose >= 2);
	  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
      }
  }
}


################################################################
## Display full help message
sub PrintHelp {
    open HELP, "| more";
    print HELP <<End_of_help;
NAME
	install-organism

AUTHOR
        Jacques van Helden (Jacques.van-Helden\@univ-amu.fr)

USAGE
        install-organism -org organism_name

DESCRIPTION
	Install one or several organisms on the local RSAT instance.

	This script is a task manager, which (depending on the
	selected tasks) manages different steps necessary for the
	installation of an organism from the NCBI flat files:

	- Downloading genomes from the NCBI/Refseq ftp site via the
          rsync protocol
          (rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq).

	- Parsing the .gbff files (Genbank-formatted flat files).

	- Adding the organism in the config file.

	- Calculating trinucleotide frequencies in the start and stop
          codons (a way to check consistency of the gene locations).

	- Calculating oligonucleotide and dyad frequencies, which will
          be used as background models for motif discovery.

CATEGORY
	Data management.

OPTIONS
	-h	(must be first argument) display full help message
	-help	(must be first argument) display options
	-v	verbose

    MANDATORY ARGUMENTS

    	The only mandatory option is to specify one or more
    	organism(s) to be installed. This can be done with any of the
    	following options: -org, -org_file, -all_organisms, -taxon.

	-org	organism name without spaces (e.g. Saccharomyces_cerevisiae)

		The option -org can be used iteratively on the same
		command line to iterate the installation over multiple
		organisms.

        -org_file organism_file_name
		Text file containing a list of organisms to install.
		The first word of each row is taken as a query
		orgnanism. Further information of the same row is
		ignored.

	-all_organisms

		Install all the organisms found in the Refseq
		directory (see option -refseq).

      	-taxon  taxon name (mutually exclusive with -org)

		The installation will iterate over all organisms of
		the selected taxon. Note that the command will only
		apply to the organisms previously declared with the
		command install-organism -task config. The option
		-taxon is thus convenient for re-running installation
		tasks on previously installed organisms rather than
		for installing new genomes downloaded e.g. from NCBI.

		The option -taxon can be used iteratively on the same
		command line to iterate the installation over multiple
		taxa.

    ARGUMENTS TO FILTER ASSEMBLIES

        -filter field value

		Filter assemblies based on specitied values for one or
		several fields, in order to select a subset of
		relevant assemblies for installation.

		This argument can be used iteratively to specify
		several filtering fields. 

		Example 1: list the organisms labelled as reference
		genomes

		install-organism -v 1 -group bacteria  \\
                    -task available,list \\
		    -list_format org   \\
		    -filter  refseq_category "reference genome" \\
		    -o reference_genomes.txt

		Example 2: select major releases of complete genomes
		belonging to the category "representative".

		install-organism -v 1 -group bacteria  \\
                    -task available,list \\
		    -filter refseq_category "representative genome" \\
		    -filter assembly_level "Complete Genome" \\
		    -list_format table \\
		    -o representative_complete.tsv

		Example 3: same as before, but export the result as a
		bash file to run the installations. 

		install-organism -v 1 -group bacteria  \\
                    -task available,list \\
		    -filter  refseq_category "representative genome" \\
		    -filter assembly_level "Complete Genome" \\
		    -list_format bash \\
		    -o install_representative_complete.bash

        -list_format

		Format to export the list of selected assemblies /
		organisms.

		Supported list formats:

		org: organisms in RSAT format (concatenation of
		Genus, species, assembly_accession and asm_name,
		separated by underscores). One organism per row.

		table: table with one row per assembly/organism and
		selected output columns.

		bash: bash script with the commands to download and
		install the selected organisms.


    SPECIFIC ARGUMENTS FOR THE DOWNLOAD AND PARSE TASK

        -group taxonomic group according to the NCBI/Refseq ftp/rsync
	       site (ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq). 

	       Examples: bacteria, fungi, protozoa, plant,
	       invertebrate, vertebrate_mammalian, vertebrate_other

	-species species name, with space replaced by underscore
	         (e.g. Saccharomyces_cerevisiae). 

		 Note that one species may correspond to several
	         organisms if several strains are available for this
	         species. In particular, for some bacteria there are
	         several thousands of sequenced strains
	         (e.g. Escherichia coli). 

		 The option -max_strains can be used to restrict the
	         number of installed strains per species. 

		 An alternative is to specify a custom list of species
		 + strains with the option -org_table (this is the
		 recommended way to install multiple genomes from
		 NCBI).

	-species_file file containing species names in the first
	        column.

	-max_strains #

	        Maximal number of strains per species to
		parse/install.  This option avoids filling disk with
		particular bacterial species for which thousands of
		strains have been sequenced (e.g. Escherichia coli).

	-strain restricts the download and installation to a single
                assembly. By default all strains of the selected
                species are downloaded, but this can be heavy for some
                bacterial species where thousands of strains have been
                sequenced (e.g. Escherichia coli).

	      Note: the RSAT organism name is automatically built by
	      concatenating the NCBI genus + species name + assembly
	      accession + assembly name.

	-assembly_table

	        Table extracted from the NCBI ftp site, indicating the
	        organisms to install. 

		Example: assembly table of fungi:

	        ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/fungi/assembly_summary.txt

		The table can be the full table for a given group, or
		a custom subset, which can be selected on some
		informative fields of the assembly table. 

                We recommend to select the subset of rows having
		either "Complete genome" or "Chromosome" in the 12th
		column ("assembly_level"), and to add a filter on
		"refseq_category" in order to select "representative
		genome" or "reference genome". 

		The assembly table can also be filtered out manually
		in order to select a custom subset of the available
		genomes.

	-refseq
		Local directory containing the mirror of the refseq
		genomes found on the NCBI site:
		       ftp://ftp.ncbi.nih.gov/genomes/refseq
		Normally, the refseq directory is specified by
		defining a global variable REFSEQ_DIR in the config
		file. The option -refseq allows to overwrite this
		value.


    OPTIONAL ARGUMENTS

	-skip #	Skip the first # organisms of the list.

	        This option is convenient to resume the installation
	        of a list of organisms, when it has been interrupted.

		It also allows to organize the installation by chunks
		of organisms.

	-last #	Stop after the first # organisms of the list.

		This option can be used to organize the installation
		by chunks of organisms.

	-organism Full name of the organism (e.g. 'Saccharomyces
		cerevisiae'). This option is only valid when a single
		organism is installed.

	-source	data source

	-dir dir{install}
		Absolute path of the installation directory. 
		BEWARE : you should provide the absolute path of the
		installation directory, not the relative path.

	-batch run some tasks (for example the calibration of oligos
		and dyads) in batch mode or send them to a job
		scheduler.  The configuration of the job scheduler /
		batch mode should be specified with configure_rsat.pl.


	-batch_org when multiple organisms have to be installed, send
		each organism as a separate job to a job
		scheduler. The configuration of the job scheduler
		should be specified with configure_rsat.pl.

	-queue  specify a queue to send the job with the option -batch. 

	        On this instance of RSAT, default queue is $ENV{CLUSTER_QUEUE}

                Type 'qstat -f' to get a list of existing queues.

	-config
		Specify an alternative organism configuration file for the
		genome to be installed.

		By default, the organism configuration file is 
	      	\$RSAT/public_html/data/genomes/supported_organisms.pl

        -backup_org_table
                Store a backup copy of the organism table before
                overwriting it.

	-local
		Absolute path of a RSA local config file.

		By default, the newly installed organism is added to
		the main RSA config file is changed (provided the user
		has write access to the RSA config file).

		In addition to the organisms installed by the RSAT
		system administrator (found in
		$ENV{RSAT}/public_html/data/supported_organisms.tab), users can
		install some organisms locally.

		For this, the user must first define an environment
		variable called RSA_LOCAL_CONFIG, and indicating the
		absolute path of the local config file.  
		E.g.  
		  export RSA_LOCAL_CONFIG=/home/fred/RSA.local.config

		When install-organism is called with the option
		-local, the new organism is added to the file
		indicated by the environment variable RSA_LOCAL_CONFIG
		rather than the main RSA config file.

	-syn	synonym table
		A tab-delimited file containing two coloumns. The
		forst column contains a gene ID, the second a gene
		name.

	-up_from distal limit of the upstream regions (e.g. -800 for yeast)

	-up_to	proximal limit of the upstream regions (e.g. -1)

	-genbank genbank directory (obsolete, see refseq)

	-prefid feattype idname
	        passed to parse-genbank.pl

	-date last_update

	        Force the 'last_update' attribute to a given date. 

		This option is used by download-organism to ensure
		that the local genome has the same installation date
		as the server, rather than using the date of download
		as update date.

	-ensembl
		ENSEMBL directory. Directory containing the ENSEMBL
		flat files in Genbank format (ext .dat)

		Example: 
		ftp.ensembl.org/pub/current_worm/data/flatfiles/genbank

	-task	specification of a single installation task
		    e.g.
			install-organism -task dyads
		supported tasks: $supported_tasks

		Description of the tasks
		------------------------

		available    list available species on the NCBI FTP site.
		             This task cannot be combine with any
		             other task.

		download     download genome from the NCBI FTP site
		             (ftp://ftp.ncbi.nih.gov/genomes/refseq/)
		             via the rsync protocol.

		list         print the list of downloaded strains for the
		             user-specified species.

		parse        parse genome data (sequence, features) from 
		             genbank-formatted flat files (gbff)
		             downloaded from NCBI. Parsing is ensured
		             by the program parse-genbank.pl.

		config	     update configuration file

		start_stop   
			     calculate start and stop codon
			     frequencies

		allup	     retrieve all upstream sequences

		genome_segments
			     retrieve sequences and limits of genome segments
			     (intergenic, genic)

		oligos	     calculate oligonucleotide frequencies

			     This task requires to specify, in
			     addition, the type(s) of sequences for
			     which oligo frequencies have to be
			     calculated (upstream_freq,
			     intergenic_freq, genome_freq).

		dyads	     calculate dyad frequencies

			     This task requires to specify, in
			     addition, the type(s) of sequences for
			     which dyad frequencies have to be
			     calculated (upstream_freq,
			     intergenic_freq, genome_freq).

		ncf	     calculate oligo and dyad frequencies in
			     intergenic segments

		upstream_freq
			     calculate oligo and dyad frequencies for
			     all upstream sequences

		intergenic_freq
			     calculate oligo and dyad frequencies for
			     all intergenic sequences

		genome_freq  
			     calculate oligo and dyad frequencies for
			     the whole genome sequence. This is not
			     recommended for higher organisms, where
			     the genome represents several Gigabases,
			     and the computation of all oligo and dyad
			     frequencies might take ages.

		protein_freq
			     calculate amino acid frequencies for all
			     peptidic sequences. 

		protein_seq
			     remove empty rows present in the peptidic
			     sequence files dowloaded from NCBI,
			     because the fact that the first row of
			     these files is empty raises a bug with
			     diamond (highly efficient implementation
			     of blast used by genome-blast)

		clean	     remove unnecessary sequence files

	-rm	calibrate oligo and dyad frequncies on repeat masked
		sequences, in addition to the non-masked sequences.

	-img_format
		image format for the graphs of sequence length distribution

EXAMPLES

    Get the list of available genomes for one of the supported groups
    on the NCBI FTP/rsync site.  Supported values for option -group
    are the subdirectories of the FTP site
    (rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq).

	install-organism -v 1 -group [GROUP]  -task available \
            -o available_vertebrate.tab

    Download one species with rsync:

    	install-organism -v 1  -group [GROUP] -species [SPECIES] \
            -task download

    Parse the genome for a given species, and declare it in RSAT
    supported organism. NOTE: this installs all the strains for the
    selected species. For some species this can represent thousands of
    strains (e.g. Eschrichia coli). Strains can be restricted with the
    options -strain or -max_strains.

        install-organism -v 1  -group [GROUP] -species [SPECIES] \
            -task parse,config

    Run the default installation steps for a given species. Note: the
    option -list is required in order to collect the organism names,
    which are made by concatenating species and strain

        install-organism -v 1  -group [GROUP] -species [SPECIES] \
            -task list,default

    Installation can be automated and parallelized with a job
    scheduler (e.g. qsub) in order to install all the species of a
    given group.

    Example: install all species of the group "fungi" at NCBI.

    Step 1: get the list of species available at NCBI

        install-organism -v 1 -group fungi -task available \
	    -o fungi_available_species.txt

    Step 2: download genome for all the strains of these
    species. Beware, this takes space, there are several hundreds of
    species.

	install-organism -v 1 -group fungi -task download \
    	    -species_file fungi_available_species.txt

    Step 3: parse the genomes of all strains for each fungal species.

	install-organism -v 1 -group fungi -task parse,config \
    	    -species_file fungi_available_species.txt

    Step 4: collect the list of downloaded organisms. Organism names
    are built by concatenating species and strain names.

	install-organism -v 1 -group fungi \
    	    -species_file fungi_available_species.txt \
	    -task list \
	    -o fungi_downloaded_orgnanisms.txt

    Step 5: extract fasta sequences for different types of genomic
    regions, and run some control tests (e.g. oligonucleotide
    frequencies of start and stop codons). With the option -batch, the
    tasks are sent to a job scheuler (qsub) in order to be executed in
    parallel.

	install-organism -v 1 \
	    -org_file fungi_downloaded_orgnanisms.txt \
	    -task start_stop,allup,seq_len_distrib,genome_segments \
	    -task protein_seq, protein_len,fasta_genome,fasta_genome_rm \
	    -task chrom_sizes,index_bedtools \
	    -batch

    Step 6: compute the oligomer frequency tables. In batch mode, this
    task can be done only after the previous job list is finished,
    because of the dependencies between the parallelized tasks
    (upstream sequences have to be computed before computing their
    oligonucleotide and dyad frequencies).

	install-organism -v 1 \
	    -org_file fungi_downloaded_orgnanisms.txt \
	    -task upstream_freq,genome_freq,protein_freq,oligos,dyads \
	    -batch

SEE ALSO

    download-organisms

        The program I<install-organism> performs all the formatting
	and calibration tasks for importing genomes from the reference
	databases (NCBI, EMBL) to RSAT.

	The program I<download-organism> transfers the RSAT-formatted
	genomes from a RSAT server. 

 	If a genome is available on the RSAT server, it is recommended
	to use download-genomes in order to obtain it immediately in
	the RSAT format, rather than install-genomes.


    genome-blast
	run blast for all the peptidic sequences of a query organism
	against those of a reference organism (db), and generate a
	table with all the detected similarities.

End_of_help
    close HELP;
    exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
  open HELP, "| more";
  print HELP <<End_short_help;
install_organism options
------------------------
-h		(must be first argument) display full help message
-help		(must be first argument) display options
-v		verbose
-n		dry run (print commands without executing them)
-org		organism name without spaces (e.g. Saccharomyces_cerevisiae); can be used iteratively
-organism	full organism name (e.g. Saccharomyces cerevisiae)
-taxon		iterate installation tasks over all the previously installed organisms of a taxon
-org_file	organism_file_name
-group          taxonomic group required for the URL of the NCBI/refseq FTP site.
-species        species name (for tasks download, list and parse)
-species_file   file containing species names in the first column
-strain         strain (for task parse)
-max_strains    max number of strains per species to be parsed / installed
-skip #         Skip the # first organisms (convenient to resume interrupted list) 
-last #         Stop after the # first organisms (convenient to install by chunks) 
-source		data source (e.g. refseq);
-dir		absolute path of the installation directory
-batch  	run some tasks (for examplethe calibration of oligos and dyads) in batch mode.
-batch_org      when installing multiple orgnaisms, send each organism genome-blasseparately to the job scheduler
-queue          specify a queue to send the job with the option -batch (default: $ENV{CLUSTER_QUEUE}) 
-config		alternative organism configuration file
-backup_org_table  Store a backup copy of the organism table before overwriting it.
-local		update local config file 
		(specified by the environment variable RSA_LOCAL_CONFIG)
-refseq	        refseq directory
-ensembl	ensembl directory
-task		installation task ($supported_tasks)
-rm		calibrate oligo and dyad frequncies on repeat masked sequences
-syn		synonym table
-up_from       	distal limit of the upstream regions (e.g. -800 for yeast)
-up_to		proximal limit of the upstream regions (e.g. -1)
-prefid feattype idname     passed to parse-genbank.pl
-date 		 force last_update to a given date (for synchro between server and local installation)
-img_format	 image format for the graphs of sequence length distribution

End_short_help
  close HELP;
  exit;
}

################################################################
## Read arguments 
sub ReadArguments {
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

    #    foreach my $a (0..$#ARGV) {
    ### verbose ###
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$main::verbose = shift(@arguments);
      } else {
	$main::verbose = 1;
      }

      ## dry run
    } elsif ($arg eq "-n") {
      $dry = 1;
      $main::verbose = 1;

      ## detailed help
    } elsif ($arg eq "-h") {
      &PrintHelp();

      ## list of options
    } elsif ($arg eq "-help") {
      &PrintOptions();

      ## output file
    } elsif ($arg eq "-o") {
      $outputfile = shift(@arguments);


      ## Organism name in one word (spaces replaced  by underscores)
    } elsif ($arg eq "-org") {
      my $organism_short_name = shift(@arguments);
      &RSAT::error::FatalError( $organism_short_name, "Invalid organism name: cannot contain spaces") if ($organism_short_name =~ /\s/);
      &RSAT::error::FatalError( $organism_short_name, "Invalid organism name: should not be empty.") unless ($organism_short_name =~ /\S/);
      &RSAT::error::FatalError( $organism_short_name, "Invalid organism name: should contain at least one underscore.") unless ($organism_short_name =~ /_/);
      push @organisms, $organism_short_name;

      ## Species, used to define the FTP site for the task "download"
    } elsif ($arg eq "-species") {
      my $species = shift(@arguments);
      &RSAT::error::FatalError( $organism_short_name, "Invalid species name: cannot contain spaces") if ($species =~ /\s/);
      &RSAT::error::FatalError( $organism_short_name, "Invalid species name: should not be empty.") unless ($species =~ /\S/);
      &RSAT::error::FatalError( $organism_short_name, "Invalid species name: should contain at least one underscore.") unless ($species =~ /_/);
      push @species, $species;


      ## Input file with organism list
    } elsif ($arg eq "-species_file") {
      $infile{species} = shift(@arguments);


      ## Strain, used to define the FTP site for the task "download"
    } elsif ($arg eq "-strain") {
      $strain = shift(@arguments);

      ## Strain, used to define the FTP site for the task "download"
    } elsif ($arg eq "-max_strains") {
      $max_strains = shift(@arguments);
      &RSAT::error::FatalError($max_strains, "is not a valid value for option -max_strains. Should be a Natural number. ") unless (&RSAT::util::IsNatural($max_strains));

      ## Group, used to define the FTP site for the task "download"
    } elsif ($arg eq "-group") {
      $group = shift(@arguments);

      ## Input file with organism list
    } elsif ($arg eq "-org_file") {
      $infile{organisms} = shift(@arguments);

      ## Taxon
    } elsif ($arg eq "-taxon") {
      push @taxa, shift(@arguments);

      ## All organisms found in the Refseq dir
    } elsif ($arg eq "-all_organisms") {
      $main::all_organisms = 1;

      ## Filter assemblies based on one or more criteria
    } elsif ($arg eq "-filter") {
	my $filter_field = shift(@arguments);
	my $filter_value = shift(@arguments);
	$filter{$filter_field} = $filter_value;
	
      ## Format to export lists of selected assemblies / organisms
    } elsif ($arg eq "-list_format") {
	$list_format = shift(@arguments);
	unless ($supported_list_format{$list_format}) {
	    &RSAT::error::FatalError($list_format,
				     "is not a valid value for option -list_format. Supported: ",
				     join(", ", keys %supported_list_format));
	}
      
      ## Skip the N first organisms of the list (can be useful if the
      ## task was interrupted).
    } elsif ($arg eq "-skip") {
      $main::skip = shift(@arguments);
      &RSAT::error::FatalError($main::skip, "Invalid value for option -skip, should be a Natural number") 
	  unless (&IsNatural($main::skip));

      ## Stop after the N first organisms of the list (can be useful
      ## for tests on a subset of the list of oganisms).
    } elsif ($arg eq "-last") {
      $main::last = shift(@arguments);
      &RSAT::error::FatalError($main::last, "Invalid value for option -last, should be a Natural number") 
	  unless (&IsNatural($main::last));


      ## Full organism name (may include spaces)
    } elsif ($arg eq "-organism") {
      $user_specified_orgname = shift(@arguments);

      ## synonyms
    } elsif ($arg =~ /-syn/) {
      $infile{synonyms} = shift(@arguments);

      ## Specify the limits of upstream regions
    } elsif ($arg eq "-up_from") {
      $up_from = shift(@arguments);
      &FatalError($up_from, "Invalid value for the up_from parameter (must be integer)") unless (&IsInteger($up_from));
      &FatalError($up_from, "Invalid value for the up_from parameter (must be negative)") if ($up_from >= 0);

    } elsif ($arg eq "-up_to") {
      $up_to = shift(@arguments);
      &FatalError(join ("\t", $up_to, "Invalid value for the to parameter (must be integer)")) unless (&IsInteger($up_to));

    } elsif ($arg eq "-prefid") {
      $parse_options .= join(" " , " -prefid", shift(@arguments), shift(@arguments));

    } elsif ($arg eq "-date") {
      $force_date = shift(@arguments);

      ## installation dir
    } elsif ($arg =~ /^-dir/) {
      $install_dir = shift(@arguments);

      ## Refseq dir
    } elsif ($arg =~ /^-refseq/) {
      $dir{refseq} = shift(@arguments);

      ## Genbank dir
    } elsif ($arg =~ /^-genbank/) {
      &RSAT::error::FatalError("Option -genbank is obsolete, use -refseq instead.");

      ## ENSEMBL dir
    } elsif ($arg =~ /^-ensembl/) {
      $dir{ensembl} = shift(@arguments);

      ## data source
    } elsif ($arg eq "-source") {
      $source = shift(@arguments);

      ## Batch mode, one job per task
    } elsif ($arg eq "-batch") {
      $batch = 1;

      ## Batch mode, one job per organism
    } elsif ($arg eq "-batch_org") {
      $batch_org = 1;

      ## Queue for the job scheduler
    } elsif ($arg eq "-queue") {
      $cluster_queue = shift(@arguments);

      ## Masking modes
    } elsif ($arg eq "-rm") {
      $task{mask} = 1;
      push @masking_modes, "-rm";

      ## task selection
    } elsif (($arg eq "-task")
	     || ($arg =~ /^-step/)) {
	my @requested_tasks = split ",", shift(@arguments);
	foreach my $task (@requested_tasks) {
	    next unless $task;
	    if ($supported_task{$task}) {
		$task{$task} = 1;
		push @tasks, $task;
	    } else {
		&RSAT::error::FatalError("Unsupported task '$task'. \n\tSupported: $supported_tasks");
	    }
      }

      ## image format
    } elsif ($arg eq "-img_format") {
      $img_format = lc(shift(@arguments));

      ## local configuration file specified with an environment variable
    } elsif ($arg =~ /^-local/) {
      unless ($ENV{'RSA_LOCAL_CONFIG'}) {
	die "Error : local config file must be specified \nin an environment variable RSA_LOCAL_CONFIG\n";
      }
      $local_config = 1;

      ## alternative configuration file
    } elsif ($arg eq "-config") {
      $ENV{'RSA_LOCAL_CONFIG'}  = shift(@arguments);
      unless ($ENV{'RSA_LOCAL_CONFIG'}) {
	die "Error : local config file must be specified \nin an environment variable RSA_LOCAL_CONFIG\n";
      }
      $local_config = 1;

      ## Store a backup copy of organism table before overwriting it
    } elsif ($arg eq "-backup_org_table") {
      $main::backup_org_table = 1;

=pod

=item B<-dry>

Dry run: print the commands but do not execute them.

=cut
    } elsif ($arg eq "-dry") {
	$main::dry = 1;;

=pod

=item B<-nodie>

Do not die in case a sub-program returns an error.

The option -nodie allows you to circumvent problems with specific
sub-tasks, but this is not recommended because the results may be
incomplete.

=cut

      } elsif ($arg eq "-nodie") {
	$main::die_on_error = 0;

    }
  }
}

################################################################
## Retrieve start and stop codons and calculate word occurrences
## (for checking)
sub StartAndStopCodons {
  &RSAT::message::TimeWarn("&StartAndStopCodons()", $organism_short_name) if ($main::verbose >= 2);
  #    my $label =  "orf";
  my $label =  "id,ctg,reg_left,reg_right,orf_strand";
  my $prefix = $dir{genome}."/".${organism_short_name};
  my $job_prefix = ${organism_short_name}."_start_codons";
  my $command = &RSAT::server::GetProgramPath("retrieve-seq");
  $command .= " -v 1";
  $command .= " -org ".$organism_short_name;
  $command .= " -all";
  $command .= " -type upstream -feattype cds -from 0 -to 2 ";
  $command .= " -format wc -nocomment -label ".$label;
  $command .= " -o ".$prefix."_start_codons.wc";
  $command .= " ; ";
  $command .= &RSAT::server::GetProgramPath("oligo-analysis");
  $command .= " -type dna -l 3 -return occ,freq -v -format wc -1str -sort";
  $command .= " -i ".$prefix."_start_codons.wc";
  $command .= " -o ".$prefix."_start_codon_frequencies.tab";
#  &RSAT::message::Debug($command) if ($main::verbose >= 10);
  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
#  &doit($command, $dry, $die_on_error, $main::verbose);
  &RSAT::message::Info("\tStart codons", $prefix."_start_codons.wc") if ($main::verbose >= 2);
  &RSAT::message::Info("\tStart codon frequencies", $prefix."_start_codon_frequencies.tab") if ($main::verbose >= 2);

  $job_prefix = ${organism_short_name}."_stop_codons";
  $command = &RSAT::server::GetProgramPath("retrieve-seq");
  $command .= " -v 1";
  $command .= " -org ".$organism_short_name;
  $command .= " -all";
  $command .= " -type downstream  -feattype cds -from -2 -to 0 ";
  $command .= " -format wc -nocomment -label ".$label;
  $command .= " -o ".$prefix."_stop_codons.wc";
  $command .= " ; ";
  $command .= &RSAT::server::GetProgramPath("oligo-analysis");
  $command .= " -type dna -l 3 -return occ,freq -v -format wc -1str -sort";
  $command .= " -i ".$prefix."_stop_codons.wc";
  $command .= " -o ".$prefix."_stop_codon_frequencies.tab";

  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
#  &doit($command, $dry, $die_on_error, $main::verbose);
  &RSAT::message::Info("\tStop codons", $prefix."_stop_codons.wc") if ($main::verbose >= 2);
  &RSAT::message::Info("\tStop codon frequencies", $prefix."_stop_codon_frequencies.tab") if ($main::verbose >= 2);
}


################################################################
## Parse the genome from Genbank files
sub ParseGenome {
  my ($genus_species, $strain, $organism_name, $strain_dir, $genome_dir, $org_nb, %args) = @_;
  $strain_dir = $strain_info{$strain}->{"downloads"};
  $organism_name = $strain_info{$strain}->{"organism"};
  $genus_species = $strain_info{$strain}->{"genus_species"};
  &RSAT::message::TimeWarn("Parsing genome", $org_nb, $genus_species, $strain, $organism_name, $strain_dir) if ($main::verbose >= 1);
  my $command = "$ENV{RSAT}/perl-scripts/parse-genbank.pl -v ".($main::verbose -1);
  $command .= " -i ".$strain_dir;
  $command .= $parse_options;
  if ($dir{ensembl}) {
    $command .= " -ext dat";
  } else {
    $command .= " -ext gbff"; ## 2019 format in NCBI
    $command .= " -genomeInFasta "; ## 2019 format in NCBI has the genome in fasta version separated from the genebank format
  }
  $command .= " -org '".$organism_name."'";
  #    if ($source ne $null) {
  $command .= " -source '$source'";
  #    }
  $command .= " -o ".$genome_dir;

  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);
}

################################################################
## Clean up unnecessary files to save disk space
sub CleanUp {
  &RSAT::message::TimeWarn("&CleanUp()") if ($main::verbose >= 2);
  chdir $dir{genome};

  ## delete files with intergenic and gene segment sequences
  my @files = ();
  foreach my $seq_type (qw(intergenic gene upstream upstream-noorf)) {
    foreach my $format (qw(wc fasta)) {
      foreach my $extension ("", ".gz") {
	foreach my $segments ("", "_segments") {
	  foreach my $purged ("", "_purged") {
	    my $file = "${organism_short_name}_${seq_type}${segments}${purged}.${format}${extension}";
	    if (-e $file) {
	      push @files, $file;
	    }
	  }
	}
      }
    }
  }
  foreach my $file (@files) {
    my $command = "rm -f $file";
    &doit($command, $dry, $die_on_error, $main::verbose);
  }
}



################################################################
## Dowload the assembly file for the group
sub DownloadAssemblySummary {
    my ($group) = @_;

    my $assembly_summary_url = join("/", "rsync://ftp.ncbi.nlm.nih.gov", "genomes", "refseq", $group, "assembly_summary.txt");
    
    if ($main::verbose >= 1) {
	&RSAT::message::TimeWarn("Downloading NCBI assembly summary file for group\t", $group);
     	&RSAT::message::Info("\t".$assembly_summary_url);
    }    
    &RSAT::util::CheckOutDir($dir{downloads});

    # my $command = "wget -nc ${assembly_url} --no-clobber --output-document ${assembly_file} ; ";
    my $command = "wget --no-clobber -O ".$dir{downloads}."/README_assembly_summary.txt https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt; ";
    &doit($command, $dry, $die_on_error, $main::verbose);

#     $command = "rsync -ruptvl -R --verbose ".$assembly_summary_url." ".$dir{downloads};
    $command = "rsync -ruptvl -R --verbose ".$assembly_summary_url." ".$dir{downloads};
    $command .= " && head -2 $outfile{assembly_summary} | tail -1 | perl -pe 's/\t/\n/g' | awk '{sum +=1; {print sum\"\t\"\$0}}' > $outfile{assembly_columns}";
    &doit($command, $dry, $die_on_error, $main::verbose);

    ## Display file names
    if ($main::verbose >= 1) {
	&RSAT::message::TimeWarn("Downloaded group assembly summary file");
	&RSAT::message::Info("Assembly summary file\t", $outfile{assembly_summary});
	&RSAT::message::Info("Assembly summary file columns\t", $outfile{assembly_columns});
    }
    my $available_assemblies = `grep -v '^#' $outfile{assembly_summary} | wc -l`;
    chomp($available_assemblies);
    printf $out (";    %-25s\t%s\n", "Assembly summary file", $outfile{assembly_summary});
    printf $out (";    %-25s\t%s\n", "Assembly summary columns", $outfile{assembly_columns});
    printf $out (";    %-25s\t%s\n", "Available assemblies", $available_assemblies);
    
}


################################################################
## Read assembly summary file as provided by NCBI
sub ReadAssemblySummary {
    my ($file) = @_;
    
    ## Check if the file exists
    if (-e $file) {
	&RSAT::message::TimeWarn("Reading assembly summary file\t", $file) if ($main::verbose >= 1);
    } else {
#	die"HELLO";
	&RSAT::error::FatalError("Missing assembly summary file\t", $file);
    }

    ## Parse the file
    my @header;
    my %strain_info;
    &RSAT::message::TimeWarn("Reading assembly summary table\t", $file) if ($main::verbose >= 1);
    my $l = 0; ## Line number in the assembly file
    my $selected = 0; ## Number of selected assemblies
    my ($asmb) = &OpenInputFile($file);
    my @column_index;
    while (<$asmb>) {
	s/\r//;
	$l++;
	chomp();
	if (/^#/) {
	    if (/^#assembly_accession/) {
		## Parse column headers
		s/^#//;
		@header = split("\t");
		for $h (0..$#header) {
		    $column_index{$header[$h]} = $h;
		}
		if ($main::verbose >= 3) {
		    print $out "; Column content\n";
		    for my $f (1..scalar(@header)) {
			print $out join("\t", ";", $f, $header[$f-1]), "\n";
		    }
		}
	    } else {
		next; ## Skip comment rows
	    }
	} else {

	    my $pass_filter = 1;

	    my @fields = split("\t");

	    ################################################################
	    ## Run filters on user-specified fields
	    for my $field (sort(keys(%filter))) {
		unless ($fields[$column_index{$field}] =~ /$filter{$field}/) {
		    $pass_filter = 0;
		    last;
		}
	    }
	    next unless $pass_filter;	    

	    ################################################################
	    ## Fix problems with some organism or assembly names that contain
	    ## problematic characters for folder names (brackets,
	    ## parentheses, quotes, ...).
	    my $organism_name = &CleanPathName($fields[$column_index{'organism_name'}]);
	    if ($fields[$column_index{'organism_name'}] ne $organism_name) {
		&RSAT::message::Warning("Organism name fixed", $fields[$column_index{'organism_name'}],  $organism_name) if ($main::verbose >= 5); 
		$fields[$column_index{'organism_name'}] = $organism_name;
	    }

	    my $assembly_accession = &CleanPathName($fields[$column_index{'assembly_accession'}]);
	    if ($fields[$column_index{'assembly_accession'}] ne $assembly_accession) {
		&RSAT::message::Warning("Assembly accession", $fields[$column_index{'assembly_accession'}],  $assembly_accession) if ($main::verbose >= 3); 
		$fields[$column_index{'assembly_accession'}] = $assembly_accession;
	    }

	    my $asm_name = &CleanPathName($fields[$column_index{'asm_name'}]);
	    if ($fields[$column_index{'asm_name'}] ne $asm_name) {
		&RSAT::message::Warning("Assembly name fixed", $fields[$column_index{'asm_name'}],  $asm_name) if ($main::verbose >= 3); 
		$fields[$column_index{'asm_name'}] = $asm_name;
	    }

	    ################################################################
	    ## Compute some fields by combining extracted field
	    my $current_strain = join("_", $fields[$column_index{'assembly_accession'}], $fields[$column_index{'asm_name'}]);
#	    $current_strain =~s/ /_/g; ## Fix problem with some spaces in asm_name

	    ## Check if current strain corresponds to selected strain
	    if ($strain) {
		next unless $current_strain eq $strain;
	    }
	    
	    next unless $fields[$column_index{'assembly_accession'}] && $fields[$column_index{'asm_name'}] && $current_strain;

	    ## Split organism_name into genus and species
	    ##
	    ## Note: this assumes that genus name does not contain any
	    ## underscore. Besides , "species" here may include a
	    ## subspecies suffix (e.g. strain name), but we count them
	    ## together as in NCBI.
	    my ($genus, $species) = split("_", $organism_name, 2);
	    my $genus_species = $genus."_".$species;

#	    &RSAT::message::Debug("genus_species : ", $genus_species) if ($main::verbose >= 10);
	    
	    ## Filter on species
	    if (scalar(@species) > 0) {
		unless ($selected_species{$genus_species}) {
		    next;
		}
	    }

	    # ## Run filters on user-specified fields
	    # for my $field (sort(keys(%filter))) {
	    # 	unless ($fields[$column_index{$field}] =~ /$filter{$field}/) {
	    # 	    $pass_filter = 0;
	    # 	    last;
	    # 	}
	    # }
	    # next unless $pass_filter;	    
	    
	    ## Create an entry for this assembly in the strain_info hash
	    $selected += 1;
	    for my $f (0..$#fields) {
		$strain_info{$current_strain}->{$header[$f]} = $fields[$f];
		# &RSAT::message::Debug(
		#     $f,
		#     $header[$f],
		#     $strain_info{$current_strain},
		#     join(";", sort(keys(%strain_info))),
		#     $fields[$f],
		#     $strain_info{$current_strain}->{$header[$f]},
		#     ) if ($main::verbose >= 10);
	    }

	    # die (join("\t", "BOUM", $current_strain,
	    # 	      $strain_info{$current_strain},
	    # 	      join("; ", sort(keys(%strain_info)))),
	    # 	      "\n");
	    
	    ## Define some extra fields
	    $strain_info{$current_strain}->{"n"} = $selected;
	    $strain_info{$current_strain}->{"genus"} = $genus;
	    $strain_info{$current_strain}->{"species"} = $species;
	    $strain_info{$current_strain}->{"genus_species"} = $genus_species;
	    $strain_info{$current_strain}->{"strain"} = $current_strain;
	    $strain_info{$current_strain}->{"organism"} = join("_", $genus, $species, $current_strain);
	    $strain_info{$current_strain}->{"organism"} =~ s/ //g;

#	    die (join("'\t'", "HELLO", $organism_name, $genus, $species, $assembly_accession, $asm_name, $current_strain, $strain_info{$current_strain}->{"organism"}), "\n");
	    
	    ## Define download directory for this assembly
	    ## Will be used for both tasks download and parse
	    $strain_info{$current_strain}->{"downloads"} =
		join("/",
		     $ENV{RSAT},
		     "downloads",
		     "refseq",
		     $group,
		     $strain_info{$current_strain}->{"genus"},
		     $strain_info{$current_strain}->{"genus_species"},
		     $current_strain);
	    
	    # &RSAT::message::Debug("genus : ", $genus) if ($main::verbose >= 10);
	    # &RSAT::message::Debug("species : ", $species) if ($main::verbose >= 10);
	    # &RSAT::message::Debug("genus_species : ", $genus_species) if ($main::verbose >= 10);
	    # &RSAT::message::Debug("download dir", $strain_info{$current_strain}->{"downloads"}) if ($main::verbose >= 10);

	    ## For debugging, report the 10 first assemblies
	    if (($main::verbose >= 3) && ($selected <= 10)) {
		print join("\t",
			   ";",
			   $selected,
			   $l,
			   $strain_info{$current_strain}->{"organism"},
			   $strain_info{$current_strain}->{"organism_name"},
			   $strain_info{$current_strain}->{"genus"},
			   $strain_info{$current_strain}->{"species"},
			   $strain_info{$current_strain}->{"assembly_accession"},
			   $strain_info{$current_strain}->{"asm_name"},
			   $strain_info{$current_strain}->{"genus_species"},
			   $strain_info{$current_strain}->{"strain"},
		    ), "\n";
	    }
	}
    }

    if ($main::verbose >= 1) {
	&RSAT::message::TimeWarn("Read assembly summaries");
	&RSAT::message::Info($l, "lines");
	&RSAT::message::Info($selected, "selected assemblies");
    }
    return(%strain_info);
}


################################################################
## Export a table with a subset of the fields for selected organisms
sub ListAssemblies {
    my ($list_format, %strain_info) = @_;

    my @sorted_strains = sort {$strain_info{$a}->{"organism"} cmp $strain_info{$b}->{"organism"}} keys(%strain_info);
#    my @sorted_strains = sort {$strain_info{$a}->{"n"} <=> $strain_info{$b}->{"n"}} keys(%strain_info);
#    my @sorted_strains = sort keys(%strain_info);
    
    if ($list_format eq "org") {
	for my $current_strain (@sorted_strains) {
	    print $out $strain_info{$current_strain}->{"organism"}, "\n" if ($current_strain);
	}

	
    } elsif ($list_format eq "bash") {
	## Print a bash file with the instructions to download and install the selected genomes from
	print $out "#!bash\n";
	print $out "TASKS=download,parse,config,default\n";
	for my $current_strain (@sorted_strains) {
	    if ($strain_info{$current_strain}->{"genus_species"}) {
		print $out "install-organism -v 1 -group ", $group, 
		    " -species ", $strain_info{$current_strain}->{"genus_species"},
		    " -strain ", $strain_info{$current_strain}->{"strain"},
   		    " -task \$TASKS",
		    "\n";
	    }
	}

    } else {
	## Print a table with selected fields
	my @fields = qw(
 	    organism
	    taxid
	    species_taxid
	    genus_species
	    strain
	    genus
	    species
	    assembly_accession
	    asm_name
            n
	    );
	print $out "#", join("\t", @fields, "n"), "\n";
	$n = 0;
	for my $current_strain (@sorted_strains) {
#	    $n++;
	    my @values = ();
	    for my $field (@fields) {
		push @values, $strain_info{$current_strain}->{$field};
	    }
#	    push @values, $n;
	    print $out join("\t", @values), "\n";
	}
    }
}

################################################################
## Suppress empty rows from peptidic sequence files (present in NCBI
## genomes) because the first empty row raises a bug with diamond
## formatdb and blastall.
sub CleanProteinSequences {
    my ($peptidic_seq_file) = @_;
    
    if (-e $peptidic_seq_file) {
	my $empty_rows = `grep '^\$' ${peptidic_seq_file} | wc -l `;
	$empty_rows =~ s/\s//g; 
	if ($empty_rows > 0) {
	    if ($main::verbose >= 3) {
		&RSAT::message::Warning("Peptidic sequence file contains ".$empty_rows." empty rows");
		&RSAT::message::Info("Cleaning  ".$empty_rows." empty rows from peptidic sequence fasta file");
	    }
	    &doit('perl -pi -e "s/^\n$//" '.$peptidic_seq_file);
	}
    } else {
	&RSAT::message::Warning("Peptidic sequence file is missing", $peptidic_seq_file);
    }
}


sub CleanPathName {
    my ($name) = @_;
    # suppress problematic characters
    $name =~ s/ /_/g;
    $name =~ s/\[//g;
    $name =~ s/\]//g;
    $name =~ s/\'//g;
    $name =~ s/\"//g;
    $name =~ s/\)//g;
    $name =~ s/\(//g;
    $name =~ s/\///g;
    return ($name);
}
