#!/usr/bin/env perl ############################################################ # # $Id: install-organism,v 1.129 2013/10/13 12:24:00 jvanheld Exp $ # # Time-stamp: <2017-03-17 10:53:06 eda> # ############################################################ ## TO DEBUG: with option -org_list all organisms get the same name ## (the first one of the list). JvH 2016-02-17. # use strict; BEGIN { if ($0 =~ /([^(\/)]+)$/) { push (@INC, "$`lib/"); } } require "RSA.lib"; require "RSAT_to_ensembl.lib.pl"; use RSAT::util; use RSAT::Tree; use RSAT::TreeNode; use RSAT::organism; if ($0 =~ /([^(\/)]+)$/) { push (@INC, "$`lib/"); } ## Main package package main; { ################################################################ ## Initialize parameters local $start_time = &RSAT::util::StartScript(); our $org_table = $ENV{RSAT}."/public_html/data/supported_organisms.tab"; our @masking_modes = (""); ## by default, use no masking our @oligo_lengths=(6,1,2,3,4,5,7,8); our $verbose = 0; our $seq_format = "fasta"; ## default format for exporting sequences our $img_format = $ENV{rsat_img_format} || "png"; our $noov=""; our $strands="-1str"; our $purged_frequencies = 0; #### temporarily inactivated because for human genome it suppresses almost all sequences !!!! our %supported_for_installation; $supported_for_installation{"fasta"} = 1; $supported_for_installation{"raw"} = 1; $supported_for_installation{"filelist"} = 1; our @default_tasks = qw( start_stop allup seq_len_distrib genome_segments upstream_freq protein_seq protein_freq protein_len oligos dyads fasta_genome fasta_genome_rm chrom_sizes index_bedtools ); # config ## JvH suppresed config from default tasks to enable submission to job scheduler without destroying the table of supported organisms our @extra_tasks = qw( config genome_freq phylogeny uninstall erase distrib ncf intergenic_freq ensembl_freq clean default rewrite all ); ## Tasks to handle download and parsing of genomes from NCBI ftp/rsync site. our @species_tasks = qw( available download list parse); ## Tasks compatible with batch execution our @batch_tasks = qw( phylogeny start_stop allup seq_len_distrib genome_segments upstream_freq genome_freq protein_seq protein_freq protein_len oligos dyads default fasta_genome fasta_genome_rm chrom_sizes index_bedtools distrib ncf intergenic_freq ensembl_freq ); our @organism_tasks = (@default_tasks, @extra_tasks); foreach my $task (@organism_tasks) { $organism_task{$task} = 1; } foreach my $task (@species_tasks) { $species_task{$task} = 1; } our @supported_tasks = (@species_tasks, @organism_tasks); foreach my $task (@supported_tasks) { $supported_task{$task} = 1; } $supported_tasks = join ",", @supported_tasks; foreach my $task (@batch_tasks) { $batch_task{$task} = 1; } our %task = (); our @tasks = (); $parse_options = ""; $null = ""; ## NULL value $source = $null; $taxonomy = $null; $taxid = $null; @taxa = (); our %infile = (); ## Export format for the lists of organisms or assemblies %supported_list_format = ("org" => 1, "table" => 1, "bash" => 1); our $list_format = "table"; # Default list format. ## File containing a list of organisms to install $infile{organisms} = ""; ## File containing a list of species to install from NCBI/Refseq FTP site $infile{species} = ""; ## List of organisms our @species = (); # for download and install our @organisms = (); # organisms are concatenation of species + strain ## Strain our $strain = ""; our $max_strains = ""; # max number of strains per species for parsing task ## Group our $group = ""; ## options for testing and debugging $skip = 0; ## Skip the top organisms from the lsit $last = 0; ## Stop after a given number of organisms from the list our $backup_org_table = 0; ## installation date local $install_date = $force_date || &AlphaDate(); chomp $install_date; ## Store the starting directory $dir{main} = $ENV{PWD}; ## Directory to download files from NCBI $dir{downloads} = $ENV{DOWNLOAD_DIR} || $ENV{RSAT}."/downloads/"; # $dir{refseq} = $ENV{REFSEQ_DIR} || $ENV{RSAT}."/downloads/refseq"; ## Options for the job scheduler our $dry = 0; our $die_on_error = 0; our $batch = 0; our $job_prefix = "install_org"; our $log_handle = ""; our $err_handle = ""; our $cluster_queue = $ENV{CLUSTER_QUEUE}; our $download_date = ""; ################################################################ ## Read arguments &ReadArguments(); ################################################################ ## Open output stream our $out = &OpenOutputFile($outputfile); ################################################################ ## Check parameter values @task = keys %task; if ($#task == -1) { &RSAT::error::FatalError("You should specify at least one task.\nSupported tasks\n\t$supported_tasks\n"); } ## Set on the default tasks is the task "default" was invoked if ($task{default}) { foreach my $task (@default_tasks) { $task{$task} = 1; } &RSAT::message::Info("Default tasks selected: ", join(", ", @default_tasks)) if ($main::verbose >= 2); } ## Run all the tasks for each organism (not the species-wide tasks ## like downloading and parsing). Note: by "organism" we mean here a ## given strain of a given species. if ($task{all}) { %task = %organism_task; } ## Check compatibility between batch and task dependencies if ($batch) { if ($task{allup} && $task{upstream_freq}) { &RSAT::error::FatalError("Tasks allup and upstream_freq cannot be combined in batch mode. First run allup, then upstream_freq."); } if ($task{config}) { &RSAT::error::FatalError("Task config cannot be executed in batch mode. "); } } ## Consistency between tasks if ($task{oligos} || $task{dyads}) { unless (($task{upstream_freq}) || ($task{intergenic_freq}) || ($task{protein_freq}) || ($task{genome_freq}) || ($task{ensembl_freq})) { &RSAT::error::FatalError("The tasks 'oligos' and 'dyads' require to specify at least one sequence type among the following. ", "\n\tupstream_freq,intergenic_freq,genome_freq,protein_freq,ensembl_freq"); } } if (($task{protein_freq}) && !($task{oligos})) { &RSAT::error::FatalError("The task 'protein_freq' requires to activate the task 'oligos'"); } ## Count the number of species-wide and organism-specific takss our $requested_species_tasks = 0; our $requested_organism_tasks = 0; &RSAT::message::TimeWarn("Checking tasks", join(";", keys(%task))) if ($main::verbose >= 2); foreach my $task (keys(%task)) { if ($species_task{$task}) { $requested_species_tasks++; } if ($organism_task{$task}) { $requested_organism_tasks++; } } &RSAT::message::Info("Species tasks", $requested_species_tasks) if ($main::verbose >= 2); &RSAT::message::Info("Organism tasks", $requested_organism_tasks) if ($main::verbose >= 2); ################################################################ ## If a species file has been specified, read species list if ($infile{species}) { &RSAT::message::TimeWarn("Reading species file", $infile{species}) if ($main::verbose >= 1); my ($speciess) = &OpenInputFile($infile{species}); while (<$speciess>) { next unless (/\S/); ## Skip empty rows next if (/^;/); ## Skip comment rows next if (/^#/); ## Skip header rows chomp(); my ($species) = split /\s+/; push @species, $species; } &RSAT::message::Info(scalar(@species), "species read from file", $infile{species}) if ($main::verbose >= 1); } ## Index the selected species our %selected_species = (); for my $species (@species) { $selected_species{$species} = 1; } &RSAT::message::Info("Selected species", join("; ", keys(%selected_species))) if ($main::verbose >= 1); ################################################################ ## Define the local download directory for NCBI Refseq files #$source = "NCBI-Refseq"; our $source = "NCBI"; ## 2019 this format works better for the newest formating in NCBI ## Check the directory containing the Genbank-formatted organisms ## (which can be downloaded from NCBI/refseq). unless ($dir{refseq}) { # Refseq dir specified on the command line $dir{refseq} = $dir{downloads}."/refseq"; } &RSAT::util::CheckOutDir($dir{refseq}); ################################################################ ## Check consistency of arguments for download/list/parse tasks. if ($task{parse} || $task{download} || ($task{config} && ($group))) { ## Check that all required parameters are defined ## One taxonomic group unless ($group) { &RSAT::error::FatalError("Tasks download, list and parse require to specify a group (e.g. fungi, bacteria, ...) to which all the species to download should belong."); } ## At least one species is required # unless (scalar(@species) >= 1) { # &RSAT::error::FatalError("Tasks download, list, parse and config require to specify at least one species (e.g. Saccharomyces_cerevisiae, Escherichia_coli). To get a list of available species, use option '-task> available'. "); # } ## If strain is specified, check that only one species has been requested if ($strain) { &RSAT::error::FatalError("Option -strain is only compatible with a single species. ") unless (scalar(@species) == 1); } } our %remaining_task = %task; ################################################################ ## Assemblies our %strain_info; our @sorted_strains; our $selected_strains; if ((($infile{organisms} eq "") && (scalar(@organisms) == 0) && ($taxon eq "")) || $task{available} || $task{list} || $task{download} || $task{parse}) { ################################################################ ## Define the name of the assembly summary file ## File containing the assembly summary downloaded from NCBI # my $today = &trim(`date +%Y-%m-%d`); # $outfile{assembly_summary} = "ncbi_organisms_assembly_summary_".${group}."_refseq_".${today}.".tsv"; # $outfile{assembly_summary} = $dir{downloads}."/ncbi_organisms_assembly_summary_".${group}."_refseq.tsv"; unless ($group) { &RSAT::error::FatalError("Use the option -group to specify the taxonomic group"); } $dir{assembly_summary} = join("/", $dir{downloads}."refseq", $group); $outfile{assembly_summary} = join("/", $dir{assembly_summary}, "/assembly_summary.txt"); $outfile{assembly_summary} =~ s|//assembly|/assembly|; $outfile{assembly_columns} = $outfile{assembly_summary}; $outfile{assembly_columns} =~ s/\.txt/_colums.tsv/; # $outfile{assembly_columns} = "ncbi_organisms_assembly_summary_".${group}."_refseq_".${today}."_columns.tsv"; ################################################################ ## Download or update the assembly summary table for the selected group if ($task{available}) { &DownloadAssemblySummary($group); ## If all tasks have been completed, skip the rest delete($remaining_task{available}); if (scalar(keys(%remaining_task)) == 0) { &Finish(); } } ################################################################ ## Read assembly summary table %strain_info = &ReadAssemblySummary($outfile{assembly_summary}); @sorted_strains = sort {$strain_info{$a}->{"organism"} cmp $strain_info{$b}->{"organism"}} keys(%strain_info); $selected_strains = scalar(keys(%strain_info)); } ## List assemblies if requested if ($task{list}) { &ListAssemblies($list_format, %strain_info); ## If all tasks have been completed, skip the rest delete($remaining_task{list}); if (scalar(keys(%remaining_task)) == 0) { &Finish(); } } ################################################################ ## Print arguments if required &Verbose() if ($main::verbose >= 1); ## Rewrite config table if($task{rewrite}){ &RewriteConfigTab(); exit(); } ################################################################ ## Download the required files from NCBI FTP site if ($task{download}) { ## Download each selected strain my $strain_nb = 0; &RSAT::message::Info("Downloading", $selected_strains, "assemblies from NCBII") if ($main::verbose >= 1); for my $strain (@sorted_strains) { $strain_nb++; if (($skip > 0) && ($strain_nb <= $skip)) { &RSAT::message::TimeWarn("Skipping assembly", $strain_nb."/".$selected_strains, $strain_info{$strain}->{"organism"}) if ($main::verbose >= 1); next; } if (($last > 0) && ($strain_nb > $last)) { &RSAT::message::TimeWarn("Stopping after assembly", ($strain_nb-1)."/".$selected_strains, $strain_info{$strain}->{"organism"}) if ($main::verbose >= 1); last; } &RSAT::message::TimeWarn("Downloading strain", $strain_nb."/".$selected_strains, $strain_info{$strain}->{"organism"}) if ($main::verbose >= 1); ## Restrict the number of strains if requested if (($max_strains > 0) && ($strain_nb > $max_strains)) { my $remaining_strains = $selected_strains - $strain_nb + 1; &RSAT::message::Info("Max number of strains reached", $max_strains, "Skipping", $remaining_strains, "strains of", $selected_strains) if ($main::verbose >= 1); last; } ## Define source URL for download my $URL = $strain_info{$strain}->{"ftp_path"}; unless ($URL =~ "ftp.ncbi.nlm.nih.gov") { &RSAT::error::FatalError("Strain", $strain, "Invalid URL for NCBI download", "'".$URL."'"); } if ($main::verbose >= 1) { &RSAT::message::TimeWarn("Downloading genome for strain", $strain, "\n\tfrom NCBI FTP site", $URL, "\n\tto local dir", $strain_info{$strain}->{"downloads"}); } ## Create local download directory if needed &RSAT::util::CheckOutDir($strain_info{$strain}->{"downloads"}); #=============================================0 old block ===================================================== # rsync is going to be be retired by NCBI on june 2026, should be replaced by wget # also The directory structure available through https at NCBI is not the same as the one with rsync: # in particular the https://ftp.ncbi.nlm.nih.gov/genomes/..... is not available and we get # @ERROR: Unknown module "genomes" # # ## Create local download directory if needed #&RSAT::util::CheckOutDir($strain_info{$strain}->{"downloads"}); #$RSYNC_URL = $URL; #$RSYNC_URL =~ s/https/rsync/; #my $command = "rsync --copy-links --times --recursive --verbose ".$RSYNC_URL."/* ".$strain_info{$strain}->{"downloads"}."/"; #============================================================================================================================== #-------------------------------------- new block 07-05-2026 ------------------------------------------------------------ ## Create local download directory if needed &RSAT::util::CheckOutDir($strain_info{$strain}->{"downloads"}); my $command = "wget -r -l1 -np -nd -e robots=off " ."--accept '*.gz,*.txt,*.md5' " ."--reject 'index.html*' " ."-P ".$strain_info{$strain}->{"downloads"}." ".$URL; ## Download data from NCBI &doit($command, $dry, $die_on_error, $main::verbose); &RSAT::message::TimeWarn("Downloaded genome in dir", $strain_info{$strain}->{"downloads"}) if ($main::verbose >= 3); ## Store download date in strain_info $download_date = &AlphaDate(); $strain_info{$strain}->{"download_date"} = $download_date; } } ## Parse if ($task{parse}) { my $strain_nb = 0; # my $selected_strains = scalar(keys(%strain_info)); if ($main::strain) { @sorted_strains = $strain; $species = $species[0]; unless ($species) { &FatalError("Option -strain requires to specify species (i.e. Genus_species) with option -species"); } my ($genus, @species_pieces) = split(/[ _]/, $species); my $species = join "_", @species_pieces; ## join the elements of species, which sometimes contain sub-species levels e.g. strain my $organism = join("_", $genus, $species, $strain); $strain_info{$strain}->{"strain"} = $strain; $strain_info{$strain}->{"species"} = $species; $strain_info{$strain}->{"genus_species"} = $genus."_".$species; $strain_info{$strain}->{"organism"} = $organism; $strain_info{$strain}->{"downloads"} = join("/", $ENV{RSAT}, "downloads", "refseq", $group, $strain_info{$strain}->{"genus"}, $strain_info{$strain}->{"genus_species"}, $strain); &RSAT::message::Debug("organism= ", $organism, "\n", "genus= ", $genus, "\n", "species= ", $species, "\n", "genus_species= ", $strain_info{$strain}->{"genus_species"}, "\n", "strain= ", $strain, "\n", "downloads= ", $strain_info{$strain}->{"downloads"}, "\n", ) if ($main::verbose >= 5); # die "HELLO"; &RSAT::message::Info("Parsing a single strain specified with options -strain ${strain} -species ${species}") if ($main::verbose >= 1); } for my $strain (@sorted_strains) { $strain_nb++; if (($skip > 0) && ($strain_nb <= $skip)) { &RSAT::message::TimeWarn("Skipping assembly", $strain_nb."/".$selected_strains, $strain_info{$strain}->{"organism"}) if ($main::verbose >= 1); next; } if (($last > 0) && ($strain_nb > $last)) { &RSAT::message::TimeWarn("Stopping after assembly", ($strain_nb-1)."/".$selected_strains, $strain_info{$strain}->{"organism"}) if ($main::verbose >= 1); last; } &RSAT::message::TimeWarn("Parsing strain", $strain_nb."/".$selected_strains, $strain_info{$strain}->{"organism"}) if ($main::verbose >= 1); my $organism_name = $strain_info{$strain}->{"organism"}; my $genome_dir = $ENV{RSAT}."/public_html/data/genomes/".$organism_name."/genome"; &ParseGenome($strain_info{$strain}->{"species"}, $strain, $organism_name, $strain_info{$strain}->{"downloads"}, $genome_dir, $org_nb, %{$strain_info{$strain}}); } } ################################################################ ## Select organisms to install if ($main::all_organisms) { ## Select all organisms @organisms = &RSAT::OrganismManager::get_supported_organisms(); } elsif ($infile{organisms}) { ## If an organism file has been specified, read organism list my ($orgs) = &OpenInputFile($infile{organisms}); while (<$orgs>) { next unless (/\S/); ## Skip empty rows next if (/^;/); ## Skip comment rows next if (/^#/); ## Skip header rows chomp(); my ($org) = split /\s+/; push @organisms, $org; } } elsif (scalar(@taxa) > 0) { ## If a taxon has been specified, collect the list of ## organisms belonging to this taxon in the RSAT-supported ## organisms if ($task{parse}) { &RSAT::message::Warning("With the option -taxon, the parse option will only re-parse previously installed organisms from the selected taxon"); } $tree = new RSAT::Tree(); $tree->LoadSupportedTaxonomy("Organisms", \%supported_organism); foreach my $taxon (@taxa) { my $taxon_node = $tree->get_node_by_id($taxon); my @taxon_organisms = $taxon_node->get_all_descendents("DFS","leaf",undef,undef); &RSAT::message::TimeWarn("Collected", scalar(@taxon_organisms), "organisms for taxon", $taxon, $taxon_node) if ($main::verbose >= 1); foreach my $org_node (@taxon_organisms) { my $short_organism_name = $org_node->getid(); push @organisms, $short_organism_name; } } @organisms = sort (@organisms); } elsif (scalar(@organisms) == 1) { ## Install all organisms found in assembly summary and that ## passed the filters if specified for my $strain (@sorted_strains) { push @organisms, $strain_info{$strain}->{"organism"}; } } elsif (scalar(@sorted_strains) > 0) { for my $strain (@sorted_strains) { push @organisms, $strain_info{$strain}->{"organism"}; } } &RSAT::message::Info("Organisms to install", scalar(@organisms)) if ($main::verbose >= 1); # if ($main::verbose >= 10) { # &RSAT::message::Info(join("; ", @organisms)); # } ## Check that at least one organism has been specified unless ($task{list}) { unless ((scalar(@organisms) > 0) || ($task{parse})) { # if ($strain && $species[0]) { # @organisms = (join("_", $species, $strain)); # } else { &RSAT::error::FatalError("At least one organism should be specified (options -org, -taxon or -all_organisms)") # } } } ################################################################ ## Some options are incompatible with multiple organisms if (scalar(@organisms) > 1) { &RSAT::message::FatalError("The option -organism (organism full name) is not valid when multiple organisms are selected.") if ($user_specified_orgname); &RSAT::message::FatalError("The option -dir is (installation directory) not valid when multiple organisms are selected.") if ($install_dir); &RSAT::message::FatalError("The option -syn (synonym table) is not valid when multiple organisms are selected.") if ($outfile{synonyms}); } ################################################################ ## Check if any task has to be done on organisms (and not just to ## NCBI species). if ($requested_organism_tasks == 0) { &RSAT::message::Warning("No organism-specific task has been requested. ") if ($main::verbose >= 1); exit(0); } ################################################################ ## Iterate installation over selected organisms $i = 0; foreach our $organism_short_name (@organisms) { $i++; if (($skip > 0) && ($i <= $skip)) { &RSAT::message::TimeWarn("Skipping organism", $i."/".scalar(@organisms), $organism_short_name) if ($main::verbose >= 1); next; } if (($last > 0) && ($i > $last)) { &RSAT::message::TimeWarn("Stopping after organism", ($i-1)."/".scalar(@organisms), $organism_short_name) if ($main::verbose >= 1); last; } &RSAT::message::TimeWarn("Installing organism", $i."/".scalar(@organisms), $organism_short_name) if ($main::verbose >= 1); $backup_org_table = 0 if ($i > 1); ## When we treat multiple organisms, we don't want to store a backup for each one if ($batch_org) { my $command = "install-organism"; $command .= " -v ".$main::verbose; $command .= " -group ".$group if ($group); $command .= " -org ".$organism_short_name; my @tasks_to_batch = (); foreach my $task (@tasks) { if ($batch_task{$task}) { push @tasks_to_batch, $task; } } my $org_tasks = join(",", @tasks_to_batch); $command .= " -task ".$org_tasks; &RSAT::message::Info("Organism batch command\t", $command) if ($main::verbose >= 1); &doit($command, $dry, $die_on_error, $main::verbose, $batch_org, $job_prefix, $log_handle, $err_handle, $cluster_queue); next; } unless (($task{erase}) || ($task{uninstall})) { ## Create an object to hold the organism attribtues $organism = new RSAT::organism(); # $organism->check_name($organism_short_name, 1); $organism->force_attribute("ID", $organism_short_name); $organism->set_attribute("feature_types", '*'); ################################################################ ## If the organism was previously supported, ensure that the ## serialized files are suppressed. &RSAT::message::Info("Cleaning previous serialized files") if ($main::verbose >= 3); $organism->delete_serial_files(); ################################################################ ## Automatic full name specification my $organism_full_name = ""; unless ($organism_full_name{organism_short_name}) { if ($user_specified_orgname) { ## Full name specified with option -organism on the command line $organism_full_name = $user_specified_orgname; } else { ## Build organism full name from the organism short name (replace _ by spaces) $organism_full_name = $organism_short_name; if (($source eq 'ensemblgenomes') || ($source eq 'ensembl')){ $organism_full_name = (defined $species[0] && !($species[0] eq '')) ? ucfirst($species[0]) : $organism_short_name; @names = split($organism_full_name."_", $organism_short_name); $organism->set_attribute("genome_assembly", $names[1]); } $organism_full_name =~ s/\_/ /g; } } # $organism->set_attribute("name", $organism_short_name); $organism->set_attribute("name", $organism_full_name); $organism_full_name{$organism_short_name} = $organism_full_name; ################################################################ ## Installation directories and files umask 0002; ## TO DO (JvH, 2017-12-12): below and elsewhere in the script, ## replace the global hash tables by attributes set on the ## organism object; if ($install_dir) { $dir{install} = $install_dir; &RSAT::message::Info("Installing genome in directory specified on the command line", $dir{install}) if ($main::verbose >= 3); } elsif ($supported_organism{$organism_short_name}->{'data'}) { $dir{install} = $supported_organism{$organism_short_name}->{'data'}; &RSAT::message::Info("Installing genome in directory previously specified in the config file", $dir{install}) if ($main::verbose >= 3); } else { $dir{install} = $ENV{RSAT}."/public_html/data/genomes/".$organism_short_name; &RSAT::message::Info("Installing genome in default directory", $dir{install}) if ($main::verbose >= 3); } $dir{genome} = $dir{install}."/genome"; $dir{oligos} = $dir{install}."/oligo-frequencies"; $outfile{features} = $dir{genome}."/feature.tab"; $outfile{synonyms} = $dir{genome}."/feature_names.tab"; $outfile{genome} = $dir{genome}."/contigs.txt"; $outfile{fasta_genome} = $dir{genome}."/".$organism_short_name.".dna.genome.fa"; $outfile{chrom_sizes} = $dir{genome}."/".$organism_short_name."_chrom_sizes.txt"; $outfile{fasta_genome_rm} = $dir{genome}."/".$organism_short_name.".dna_rm.genome.fa"; ## Create output directoryies if required unless ($dry) { &RSAT::util::CheckOutDir($dir{install}); &RSAT::util::CheckOutDir($dir{genome}); &RSAT::util::CheckOutDir($dir{oligos}); } } ################################################################ ###################### Installation tasks ###################### ################################################################ ## Open an output stream for messages &OrgParameters() if ((scalar(@organisms) ==1) || ($main::verbose >= 3)); &Erase() if ($task{erase}); &Uninstall() if ($task{uninstall}); unless (($task{uninstall}) || ($task{erase})) { if ($task{config}) { $config_ok = &UpdateConfig(); next unless ($config_ok); } &CreatePhylogeny() if ($task{phylogeny}); &StartAndStopCodons() if ($task{start_stop}); &IntergenicSegments() if ($task{genome_segments}); &AllUpstream() if ($task{allup}); &FastaGenomeAndBedtoolsIndex() if ($task{fasta_genome} || $task{chrom_sizes} || $task{index_bedtools}); ## Suppress empty lines in protein sequence files our $peptidic_seq_file = $dir{genome}."/".${organism_short_name}."_aa.fasta"; &CleanProteinSequences($peptidic_seq_file) if ($task{protein_seq}); ## Compute length distribution of protein sequence if ($task{protein_len}) { my $command = &SeqLengthDistributionCmd($peptidic_seq_file, "fasta", "protein", 50, 2000); &doit($command, $dry, $die_on_error, $main::verbose); } &CalcFrequencies() if (($task{oligos}) || ($task{distrib}) || ($task{dyads})); # New name for peptidic sequence file (Sept 2015). I suppress # organism name from sequence file name. # my $peptidic_seq_file = "peptidic_sequences.fasta"; # unless (-e $dir{genome}."/".$peptidic_seq_file) { ## Previous name for the peptidic sequence file # $peptidic_seq_file = ${organism_short_name}."_aa.fasta"; # } &CleanUp() if ($task{clean}); ## Touch the installation dir to indicate the last modification date $dir{main}; chdir($dir{main}); &RSAT::message::Info("Working dir", $ENV{PWD}) if ($main::verbose >= 4); system "touch $dir{install}"; } } &Finish(); } ################################################################ ################## SUBROUTINE DEFINITIONS #################### ################################################################ sub Finish { my $exec_time = &RSAT::util::ReportExecutionTime($start_time); print $main::out $exec_time if ($main::verbose >= 1); close $out if ($outputfile); exit(0); } ################################################################ ## Display verbosity message sub Verbose { print $out "; install-organism "; &PrintArguments($out); print $out ";\n; Tasks:\n"; foreach my $task (sort(keys(%task))) { print $out ";\t$task\n"; } print $out ";\n; Supported organism table\n;\t", $main::org_table, "\n"; } ################################################################ ## Print organism parameters sub OrgParameters { print $out ";\n; Organism parameters:\n"; if ($infile{organisms}) { printf $out "; %-25s\t%s\n", "organism file", $infile{organisms}; } else { printf $out "; %-25s\t%s\n", "ID", $organism_short_name; printf $out "; %-25s\t%s\n", "Name", $organism_full_name{$organism_short_name}; ## THIS IS WRONG (JvH, 2017-12-12) printf $out "; %-25s\t%s\n", "Update date", $install_date; printf $out "; %-25s\t%s\n", "Data source", $source; printf $out "; %-25s\t%s\n", "strain", $strain; if ($strain_info{$strain}) { for my $field (qw(assembly_accession taxid species_taxid organism_name infraspecific_name version_status assembly_level seq_rel_date asm_name group genome_size genome_size_ungapped contig_count total_gene_count protein_coding_gene_count non_coding_gene_count )) { printf $out "; %-25s\t%s\n", $field, $strain_info{$strain}->{$field}; } } } ## Directories print $out "; Directories\n"; for my $d (sort(keys(%dir))) { printf $out "; %-25s\t%s\n", $d, $dir{$d}; } ## Input files print $out "; Input files\n"; for my $f (sort(keys(%infile))) { printf $out "; %-25s\t%s\n", $f, $infile{$f}; } ## Output files print $out "; Output files\n"; for my $f (sort(keys(%outfile))) { printf $out "; %-25s\t%s\n", $f, $outfile{$f}; } } ################################################################ ## Uninstall an organism by deleting its row from the configuration ## file. This does not clean the organism directory on the hard drive ## (which requires the task "erase"). sub Uninstall { foreach our $organism_short_name (@organisms) { &RSAT::OrganismManager::check_name($organism_short_name, 1); if (defined($supported_organism{$organism_short_name})) { &RSAT::message::Warning("Uninstalling organism", $organism_short_name) if ($main::verbose >= 1); delete($supported_organism{$organism_short_name}); } } &RSAT::OrganismManager::export_supported_organisms(file=>$main::org_table, backup=>$main::backup_org_table); } ################################################################ ## Erase an organism by deleting its whole folder in ## RSAT/public_html/data/genomes directory. sub Erase { foreach our $organism_short_name (@organisms) { &RSAT::message::Warning("Erasing organism", $organism_short_name) if ($main::verbose >= 1); ## Identify the data directory for the organism my $organism_data_dir = ""; if ((defined($supported_organism{$organism_short_name})) && (defined($main::supported_organism{$organism_short_name}->{'data'}))) { $organism_data_dir = $main::supported_organism{$organism_short_name}->{'data'}; } else { $organism_data_dir = $ENV{RSAT}."/public_html/data/genomes/".$organism_short_name; } ## Remove the directory if (-e $organism_data_dir) { &RSAT::message::Warning("Erasing directory", $organism_data_dir) if ($main::verbose >= 2); &doit("rm -rf ".$organism_data_dir, $dry, $die_on_error, $main::verbose); } else { &RSAT::message::Warning("No need to erase unexisting directory", $organism_data_dir); } } } ################################################################ ## Update configuration table for the current organism sub UpdateConfig { ## Read taxonomy from the current organism my $organism_table = $dir{install}."/genome/organism.tab"; my $id = $null; $taxid = $null; $taxonomy = $null; unless (-e $organism_table) { if ($main::die_on_error) { &RSAT::error::FatalError("Missing organism table", $organism_table); } else { &RSAT::message::Warning("Missing organism table", $organism_table); return(0); } } ($org_handle) = &OpenInputFile($organism_table); my $taxid_field = 1; ## Default field for the taxonomic ID my $taxonomy_field = 2; ## Default field for the taxonomy while (<$org_handle>) { chomp(); next unless (/\S/); if (/^-- field\s+(\d) taxonomy/) { $taxonomy_field = $1; } elsif (/^-- field\s+(\d) taxid/) { $taxid_field = $1; } elsif (/^--/ || /^;/ || /^#/) { #; are required to parse organism.tab created by parse_gtf next; } else { @fields = split "\t"; $id = $fields[0]; ## Get taxonomy $taxonomy = $fields[$taxonomy_field - 1]; ## Check if taxid is conform (must be a number) $taxid = $fields[$taxid_field - 1]; if ($taxid =~ /^\d+$/) { $supported_organism{$organism_short_name}->{'taxid'} = $taxid; &RSAT::message::Info($organism_short_name, "TAXID", $taxid) if ($main::verbose >= 3); } else { &RSAT::message::Warning($taxid, "Invalid taxid in organism description table", $organism_table); $taxid = $null; } &RSAT::message::Info ("Parsed taxonomy from organism.tab", "id", $id, "taxid", $taxid, "taxonomy", $taxonomy) if ($main::verbose >= 3); } } close $org_handle; ################################################################ ## Define default limits of upstream region for retrieve-seq ## depending on the taxonomic group unless (defined($up_from)) { ## If already defined in the file supported-organism.tab, keep ## previously defined value (might have been specifically tuned ## for some reason). if (defined($supported_organism{$organism_short_name}->{'up_from'})) { $up_from = $supported_organism{$organism_short_name}->{'up_from'};; } else { ## Taxon-specific default values for sequence lengths if ( ($taxonomy =~ /^Bacteria;/) || ($taxonomy =~ /^Archaea;/) || ($taxonomy =~ /;\s*Bacteria;/) || ($taxonomy =~ /;\s*Archaea;/) ) { $up_from=-400; } elsif (($taxonomy =~ /^Viruses;/) || ($taxonomy =~ /;\s*Viruses;/) ) { $up_from=-400; } elsif ($taxonomy =~ /;\s*Fungi;/) { $up_from=-800; } elsif ($taxonomy =~ /;\s*Nematoda;/) { $up_from=-3000; } elsif ($taxonomy =~ /;\s*Insecta;/) { $up_from=-3000; } elsif ($taxonomy =~ /;\s*Metazoa/) { $up_from=-3000; } elsif ($taxonomy =~ /;\s*Viridiplantae/) { $up_from=-3000; } else { $up_from=-1000; } } } unless (defined($up_to)) { if (defined($supported_organism{$organism_short_name}->{'up_to'})) { $up_to = $supported_organism{$organism_short_name}->{'up_to'}; } else { $up_to = -1; } } &RSAT::message::Info("Upstream region limits from", $up_from, "to", $up_to) if ($main::verbose >= 3); &UpdateConfigTab() unless (($dry) || ($batch)); return(1); } ################################################################ ## Update the perl config file sub UpdateConfigPerl { my $comment_previous_config = 0; if ($local_config) { $config_to_update = $ENV{'RSA_LOCAL_CONFIG'} ; } else { $config_to_update = $ENV{RSAT}."/public_html/data/supported_organisms.pl"; } ## Check if the organism was already installed before if (($main::verbose >= 4) && defined($supported_organism{$organism_short_name}->{'genome'})) { &RSAT::message::Warning($organism_short_name, "already defined in the config file", $config_to_update) ; } ## read previous config open CONFIG, $config_to_update; while ($line = ) { chomp $line; last if ($line =~ /return/); if (($line =~ /supported_organism\{\'$organism_short_name\'\}/) && ($line !~ /^\#/)) { ## comment the previous config if ($comment_previous_config) { $previous_config .= "# ${line} # reinstalled on\t${date}\n"; } } else { $previous_config .= $line."\n"; } } close CONFIG; ## Write updated config file &RSAT::message::Info ("Updating supported organisms", $config_to_update) if ($main::verbose >= 3); open CONFIG, ">$config_to_update" || die "Error: cannot write config file $config_to_update\n"; print CONFIG $previous_config; my $new_org_config = "\n#### $organism_short_name\t$organism_full_name{$organism_short_name}\t$install_date\n"; $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'name'} = \"$organism_full_name{$organism_short_name}\";\n"; $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'data'} = \"$dir{install}\";\n"; $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'last_update'} = \"".$install_date."\";\n"; $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'source'} = \"$source\";\n"; ## OLIVIER SAND SHOULD CHECK IF THIS RESTRICTION FOR ensembl IS STILL VALID unless ($source =~ 'ensembl') { $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'features'} = \"$outfile{features}\";\n"; $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'genome'} = \"$outfile{genome}\";\n"; $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'seq_format'} = \"filelist\";\n"; } $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'taxonomy'} = \"$taxonomy\";\n"; if (defined($outfile{synonyms})) { $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'synonyms'} = \"$outfile{synonyms}\";\n"; } $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'up_from'} = ".$up_from.";\n"; $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'up_to'} = ".$up_to.";\n"; ## Replace absolute paths by relative paths ## $new_org_config =~ s|$ENV{RSAT}\/|\$ENV\{RSAT\}\/|g; $new_org_config =~ s|$ENV{RSAT}|\$ENV\{RSAT\}\/|g; $new_org_config =~ s|\/\/|/|g; $download_date = &AlphaDate(); $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'download_date'} = ".$download_date.";\n"; print CONFIG $new_org_config; print CONFIG "\nreturn 1;\n"; close CONFIG; } ################################################################ ## Update the tab-delimited file describing the list of supported ## organisms sub UpdateConfigTab { my (%args) = @_; &RSAT::message::Info ("Updating supported organisms", $main::org_table) if ($main::verbose >= 3); ## Update the hash variable for the new organism $supported_organism{$organism_short_name}->{'name'} = $args{name} || $organism->get_attribute("name"); $supported_organism{$organism_short_name}->{'data'} = $args{data} || $dir{install}; $supported_organism{$organism_short_name}->{'last_update'} = $args{last_update} || $install_date; $supported_organism{$organism_short_name}->{'genome_assembly'} = $organism->get_attribute("genome_assembly"); $supported_organism{$organism_short_name}->{'download_date'} = &AlphaDate() if ($task{download}); $supported_organism{$organism_short_name}->{'update_date'} = &AlphaDate(); # &RSAT::message::Debug($organism, $organism_short_name, $organism->get_attribute("name"), $install_date) if ($main::verbose >= 10); ## Change source if requested if (($main::source) && ($main::source ne $null)) { $supported_organism{$organism_short_name}->{'source'} = $main::source; } elsif (($args{source}) && ($args{source} ne $null)) { $supported_organism{$organism_short_name}->{'source'} = $args{source}; } $supported_organism{$organism_short_name}->{'features'} = $args{features} || $main::outfile{features}; $supported_organism{$organism_short_name}->{'genome'} = $args{genome} || $main::outfile{genome}; $supported_organism{$organism_short_name}->{'seq_format'} = $args{seq_format} || "filelist"; $supported_organism{$organism_short_name}->{'taxonomy'} = $args{taxonomy} || $main::taxonomy; $supported_organism{$organism_short_name}->{'taxid'} = $args{taxid} || $main::taxid; if (defined($main::outfile{synonyms})) { $supported_organism{$organism_short_name}->{'synonyms'} = $args{synonyms} || $main::outfile{synonyms}; } $supported_organism{$organism_short_name}->{'up_to'} = $args{up_to} || $main::up_to; if (($main::up_from) && ($main::up_from ne $null)) { $supported_organism{$organism_short_name}->{'up_from'} = $main::up_from; } elsif (($args{up_from}) && ($args{up_from} ne $null)) { $supported_organism{$organism_short_name}->{'up_from'} = $args{up_from}; } ## Update blast column in config table if blast information is available my $blast_folder = $supported_organism{$organism_short_name}->{'data'}."/blast_hits" ; &RSAT::message::Info($blast_folder) if ($main::verbose >= 2); if ( -d $blast_folder ){ $supported_organism{$organism_short_name}->{'blast_available'}= 1; } else { $supported_organism{$organism_short_name}->{'blast_available'}= 0; } ## Update variant columns in config table if variant information my $variants_folder=$supported_organism{$organism_short_name}->{'data'}."/variations" ; &RSAT::message::Info($variants_folder) if ($main::verbose >= 2); if ( -d $variants_folder ){ $supported_organism{$organism_short_name}->{'variant_available'}= 1; my $variant_source = `cat $variants_folder/source.txt`; chomp($variant_source); $supported_organism{$organism_short_name}->{'variant_source'}= $variants_source || ""; $supported_organism{$organism_short_name}->{'path_to_variant_files'}= $variants_folder; } else { $supported_organism{$organism_short_name}->{'variant_available'}= 0; $supported_organism{$organism_short_name}->{'variant_source'}= ""; $supported_organism{$organism_short_name}->{'path_to_variant_files'}= ""; } ## Update organism configuration table &RSAT::message::Debug("&UpdateConfigTab()", $supported_organism{$organism_short_name}->{'source'}, $supported_organism{$organism_short_name}->{'taxid'}, $supported_organism{$organism_short_name}->{'ID'}, $supported_organism{$organism_short_name}->{'last_update'}, $supported_organism{$organism_short_name}->{'up_from'}, $supported_organism{$organism_short_name}->{'name'}, $supported_organism{$organism_short_name}->{'variant_available'}, $supported_organism{$organism_short_name}->{'variant_source'}, $supported_organism{$organism_short_name}->{'path_to_variant_files'} ) if ($main::verbose >= 3); ## Export the updated table of supported organisms &RSAT::OrganismManager::export_supported_organisms(file=>$main::org_table, backup=>$main::backup_org_table); } ################################################################ ## Rewrite the table of supported organisms on RSAT sub RewriteConfigTab { my $supported_organism_file = &Get_supported_file(); my $new_file = ''; if (-f $supported_organism_file) { my ($s_o_file) = &OpenInputFile($supported_organism_file); ## Read supported_organisms_ensembl.tab my $data_dir = &Get_data_dir(); open ($fen, $data_dir."/supported_organisms_ensembl.tab"); my %ensembl = (); while (my $row = <$fen>){ chomp($row); my @fields = split("\t", $row); $ensembl{$fields[0]} = "\t".$fields[2]."\t".$fields[4]."\t".$fields[5]; #print $fields[0]; } close($fen); my %blast = (); my %variations = (); while(<$s_o_file>){ my @fields = split("\t"); my $key = $fields[0]; if(-d $data_dir."/genomes/".$key."/blast_hits"){ $blast{$key} = 1; }else{ $blast{$key} = 0; } $variations{$key} = (); if(-d $data_dir."/genomes/".$key."/variations"){ $variations{$key}{"variant_available"} = 1; my $variant_source = `cat $variants_folder/source.txt`; chomp($variant_source); $variations{$key}{"source"} = $variant_source || ""; $variations{$key}{"path"} = &RSAT::util::hide_RSAT_path($data_dir."/genomes/".$key."/variations"); }else{ $variations{$key}{"variant_available"} = 0; $variations{$key}{"source"} = ""; $variations{$key}{"path"} = ""; } } close($s_o_file); my ($s_o_file) = &OpenInputFile($supported_organism_file); ## Read the whole file of supported organisms from ensembl, ## and store species description lines in a hash indexed by ## full species ID, in order to sort them after having changed ## the current species fields. open($fnew, ">", $data_dir."/supported_organisms_new.tab"); while (my $row = <$s_o_file>) { next if (/^;/); ## Skip comment lines if($row =~ /^#/){ my @fields = split("\t"); print $fnew "#ID\tname\ttaxid\tsource\tlast_update\tnb\tseq_format\tup_from\tup_to\ttaxonomy\tdata\tgenome\tgenome_assembly\tgenome_version\tdownload_date\tvariant_available\tvariant_source\tpath_to_variant_files\tblast_available\n"; }else{ chomp($row); my @fields = split("\t", $row); my $full_species_id = $fields[0]; #check if the species is already in the list my $newline = $row; if(defined $ensembl{$full_species_id}){ $newline .= $ensembl{$full_species_id}; }else{ $newline .= ("\t")x3; } $newline .= "\t".$variations{$full_species_id}{"variant_available"}."\t".$variations{$full_species_id}{"source"}."\t".$variations{$full_species_id}{"path"}."\t".$blast{$full_species_id}."\n"; print $fnew $newline; } } close $s_o_file; system("mv " . $data_dir . "/supported_organisms_new.tab " . $data_dir . "/supported_organisms.tab"); } } ################################################################ ## Create a directory for the taxonomic group of the organism sub CreatePhylogeny { my $taxonomy = ""; &RSAT::message::Warning("Phylogeny indexing has been disabled"); return; ## read taxonomy from the parsing result my $organism_table = $dir{install}."/genome/organism.tab"; ($org_handle) = &OpenInputFile($organism_table); while (<$org_handle>) { chomp; if (/^-- field (\d) taxonomy/) { $taxonomy_field = $1; } elsif (/^--/) { next; } else { if (defined($taxonomy_field)) { @fields = split "\t"; $taxonomy = $fields[$taxonomy_field - 1]; &RSAT::message::Info ("Taxonomyyy\t$taxonomy\n") if ($main::verbose >= 3); } else { &Warning("Cannot read taxonomy in file $organism_table\n"); } } } close $org_handle; if ($taxonomy) { $taxonomy = &trim($taxonomy); $taxonomy =~ s|\s*;\s*|/|g; ## Each taxonomic level becomes a subdirectorry $taxonomy =~ s|\s+|_|g; ## I prefer to avoid spaces in directory names $taxonomy =~ s|$|_|g; ## this character cannot be used for a directory name $taxonomy =~ s|$|_|g; ## this character cannot be used for a directory name $taxonomy =~ s|\,|.|g; ## Not fatal, but usually not found in folder names. $taxonomy =~ s|\:|.|g; ## Not fatal, but usually not found in folder names. $dir{taxonomy} = $ENV{RSAT}."/public_html/data/phylogeny/".$taxonomy; $dir{taxonomy} =~ s|//|/|g; my ($org_dir) = &ShortFileName($dir{install}); &RSAT::util::CheckOutDir($dir{taxonomy}) unless ($dry); if ($main::verbose >= 3) { &RSAT::message::Info("Taxonomy directory", $dir{taxonomy}); &RSAT::message::Info("Organism directory", $org_dir); &RSAT::message::Info("Link to directory", $dir{install}); } &doit("cd $dir{taxonomy}; rm $org_dir; ln -s $dir{install} .",0,0,$main::verbose); } else { &RSAT::message::Warning("Cannot identify taxonomy in table ".$organism_table); } } ################################################################ ### extract the non-redudant set of intergenic and gene sequences sub IntergenicSegments { &RSAT::message::TimeWarn("&IntergenicSegments()") if ($main::verbose >= 3); chdir $dir{genome}; ## Retrieve sequences for different subtypes of genome regions &RSAT::message::TimeWarn("coding-or-not: extracting sequences for different types of genomic regions") if ($main::verbose >= 2); my $command = "coding-or-not "; $command .= "-v 1 " if ($main::verbose >= 1); $command .= "-org $organism_short_name -return ncs,cs,pos,seq,stats"; # &doit($command, $dry, $die_on_error, $main::verbose); my @types = (); push @types, "gene"; push @types, "intergenic"; foreach my $seq_type (@types) { my $seq_file = $dir{genome}."/".$organism_short_name."_".$seq_type."_segments.fasta"; ## Draw sequence length distributions &RSAT::message::TimeWarn("Computing sequence length for genomic regions", $seq_type) if ($main::verbose >= 2); if ($task{seq_len_distrib}) { $command .= "; " unless ($command eq ""); $command .= &SeqLengthDistributionCmd($seq_file, "fasta", $seq_type, 50, 2000); } ## Compress sequence file if (-e $seq_file) { &RSAT::message::TimeWarn("Compressing sequences for genomic regions", $seq_type) if ($main::verbose >= 2); $command .= "; " unless ($command eq ""); $command .= "gzip -f $seq_file"; ## Purge sequences if ($purged_frequencies) { &RSAT::message::TimeWarn("Purging sequences for genomic regions", $seq_type) if ($main::verbose >= 2); $command .= "; " unless ($command eq ""); $command .= &PurgeSeqCommand($seq_file); } # &doit($command, $dry, $die_on_error, $main::verbose); &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); } } chdir $dir{main}; } ################################################################ ### extract the complete set of upstream sequences sub AllUpstream { &RSAT::message::TimeWarn("Retrieving all upstream sequences") if ($main::verbose >= 2); foreach my $masking (@masking_modes) { foreach my $noorf ("", "-noorf") { my $seq_type = "upstream${noorf}${masking}"; my $seq_file = $dir{genome}."/".${organism_short_name}."_".${seq_type}; my @seq_formats = qw(fasta ft); # my @seq_formats = qw(ft); my $command = ""; ## Retrieve sequences in two formats: fasta is used for ## computing oligo and dyad frequencies, whereas ft is convenient for ## retrieve-seq-multigenome foreach $format (@seq_formats) { $command .= "; " unless ($command eq ""); $command .= "retrieve-seq -all"; $command .= " -type upstream -feattype gene -org ".$organism_short_name; $command .= " ".${noorf}; $command .= " ".${masking}; $command .= " -label ID"; $command .= " -format ".$format; $command .= " -o ".$seq_file.".".$format; &RSAT::message::Info("Exported sequence file", $seq_file.".".$format) if ($main::verbose >= 3); } ## Compute sequence length distributions and generate a frequency plot $command .= "; ".&SeqLengthDistributionCmd($seq_file.".fasta", "fasta", $seq_type, 50, 2000) if ($noorf); ## Purge sequences if ($purged_frequencies) { $command .= "; "; $command .= &PurgeSeqCommand($seq_file.".fasta", "fasta"); } ## Compress sequence file $command .= "; gzip -f ".$seq_file.".fasta"; &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); } } chdir $dir{main}; } ################################################################ ## Convert genome sequence to fasta, required for retrieve-seq-bed and ## bedtools, and generate the fasta index file (.fai). sub FastaGenomeAndBedtoolsIndex { &RSAT::message::TimeWarn("Generating fasta sequences and bedtools fasta index. ") if ($main::verbose >= 2); my $command = ""; ## Generate fasta sequence if ($task{fasta_genome}) { $command .= "convert-seq"; $command .= " -i ".$outfile{genome}; $command .= " -from filelist -to fasta"; $command .= " -o ".$outfile{fasta_genome}; # &doit($command, $dry, $die_on_error, $main::verbose); &RSAT::message::Info("\tFasta genome", $outfile{fasta_genome}) if ($main::verbose >= 3); ## JvH (2017-11-02): I don't remember why I made a soft link to ## dna.genome.fa instead of generating the repeat-masked genome. # $command .= "; cd ".$dir{genome}. "; ln -f -s ".$organism_short_name.".dna.genome.fa ".$organism_short_name.".dna_rm.genome.fa; cd -"; # &doit($command, $dry, $die_on_error, $main::verbose); } ## Convert soft-masked genome into hard-masked genome if ($task{fasta_genome_rm}) { $command .= "; convert-seq"; $command .= " -i ".$outfile{fasta_genome}; $command .= " -from fasta -to fasta -mask lower"; $command .= " -o ".$outfile{fasta_genome_rm}; } ## Compute chromosome sizes if ($task{chrom_sizes}) { $command .= "; " if ($command); $command .= "sequence-lengths -i ".$outfile{fasta_genome}; $command .= " -o ".$outfile{chrom_sizes}; &RSAT::message::Info("\tChromsome sizes", $outfile{chrom_sizes}) if ($main::verbose >= 3); } ################################################################ ## Generate bedtool indexes. This must be done as RSAT user in order ## to enable retrieve-seq-bed, which relies on bedtools getfasta. ## ## We use a trick to run bedtools getfasta with a single-nucleotide bed ## file, whihc creates the fasta index if it does not exist. . if ($task{index_bedtools}) { my $rand_fragment_coord = &RSAT::util::make_temp_file("","random-genome-fragments_coord", 1); my $rand_fragment_seq = &RSAT::util::make_temp_file("","random-genome-fragments_seq", 1); $command .= "; " if ($command); $command .= &RSAT::server::GetProgramPath("random-genome-fragments"); $command .= " -n 1 -l 1"; $command .= " -org ".$organism_short_name; $command .= " -o ".$rand_fragment_coord; $command .= " ; "; $command .= &RSAT::server::GetProgramPath("retrieve-seq-bed"); $command .= " -i ".$rand_fragment_coord; $command .= " -org ".$organism_short_name; $command .= " -o ".$rand_fragment_seq; $command .= " ; "; $command .= &RSAT::server::GetProgramPath("retrieve-seq-bed"); $command .= " -rm"; $command .= " -i ".$rand_fragment_coord; $command .= " -org ".$organism_short_name; $command .= " -o ".$rand_fragment_seq; $command .= " ; rm -f ".$rand_fragment_seq; $command .= " ; rm -f ".$rand_fragment_coord; &RSAT::message::Info("\tFasta index", $outfile{fasta_genome}.".fai") if ($main::verbose >= 3); #&doit($command, $dry, $die_on_error, $main::verbose); } &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); } ################################################################ ## Draw an histogram of sequence lengths sub SeqLengthDistributionCmd { my ($seq_file, $format, $seq_type, $ci, $xmax) = @_; &RSAT::message::TimeWarn("&SeqLengthDistributionCmd()", $seq_file, $format, $seq_type, $ci, $xmax) if ($main::verbose >= 2); $ci = 50 unless $ci; $format = "fasta" unless $format; my $classfreq_from = 0; chdir $dir{genome}; if ($main::verbose >= 4) { my $pwd = $ENV{PWD}; #2D my $pwd = `pwd`; # chomp $pwd; &RSAT::message::TimeWarn("&SeqLengthDistributionCmd()", $seq_file, $seq_type, "Working dir", $pwd); } my $len_prefix = $dir{genome}."/".$organism_short_name."_".$seq_type."_lengths"; my $command = "sequence-lengths -i $seq_file -in_format ".$format; $command .= " -o ${len_prefix}.tab"; $command .= "; classfreq -v 1 -i ${len_prefix}.tab "; $command .= " -col 2 -ci ".$ci." -from ".$classfreq_from; $command .= " -o ".$len_prefix."_distrib.tab"; $command .= "; XYgraph -i ${len_prefix}_distrib.tab"; $command .= " -xcol 3 -ycol 4,5,6 -lines -xmin $classfreq_from"; $command .= " -xsize 800 -ysize 400 -legend"; $command .= " -xgstep1 200 -xgstep2 50 -xmin 0"; $command .= " -xmax ".$xmax; $command .= " -ymin 0 -ygstep2 1000"; $command .= " -xlog 2" if( ($seq_type eq "intergenic") || ($seq_type eq "gene")); $command .= " -xleg1 'Sequence length (bp)'"; $command .= " -yleg1 'Frequency'"; $command .= " -title1 '".$organism_full_name{$organism_short_name}."'"; $command .= " -title2 'length distribution of ${seq_type} sequences'"; $command .= " -format ".$img_format; $command .= " -o ".$len_prefix."_distrib.".$img_format; # &doit($command, $dry, $die_on_error, $main::verbose); return($command); } ################################################################ ## purge sequences sub PurgeSeqCommand { my ($seq_file, $format) = @_; &RSAT::message::Warning("Purging sequence", $seq_file, $format) if ($main::verbose >= 2); $format = "fasta" unless ($format); my $purged_seq_file = `basename $seq_file $format`; $purged_seq_file .= "_purged.$format"; my $command = "purge-sequence -i $seq_file -format $format -ml 300 -mis 9 -2str -o $purged_seq_file"; # &doit($command, $dry, $die_on_error, $main::verbose); ## compress purged sequence file $command .= "; gzip -f $purged_seq_file"; # &doit($command, $dry, $die_on_error, $main::verbose); return($command); } ################################################################ ### calculate oligo and dyad frequencies in different sequence types : ### - intergenic ### - upstream ### - genomic ### sub CalcFrequencies { # my ($seq_file, $seq_format, $seq_type) = (); &RSAT::message::TimeWarn("&CalcFrequencies()", $seq_file, $seq_format, $seq_type) if ($main::verbose >= 2); chdir $dir{oligos}; ################################################################ ## Calculate oligonucleotide and dyad frequencies in all upstream ## sequences if ($task{upstream_freq}) { foreach my $masking (@masking_modes) { foreach my $noorf ("-noorf", "") { $seq_type = "upstream${noorf}${masking}"; $seq_format = "fasta"; $seq_file = $dir{genome}."/".$organism_short_name."_".$seq_type; if ($purged_frequencies) { $seq_file .= "_purged"; } $seq_file .= ".fasta"; unless (-e $seq_file) { if (-e $seq_file.".gz") { $seq_file = $seq_file.".gz"; } else { &RSAT::error::FatalError("Cannot compute upstream frequencies because upstream sequence file is missing:\t", $seq_file); } } &RSAT::message::TimeWarn("Calculating upstream oligo and dyad frequencies", $noorf, $masking, $seq_file) if ($main::verbose >= 2); &CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos}); &CalcOligoDistrib($seq_file, $seq_format, $seq_type) if ($task{distrib} && !($noorf)); &CalcDyadFreq($seq_file, $seq_format, $seq_type) if ($task{dyads}); } } } ################################################################ ## Specific tasks for genomes downloaded from Ensembl if ($task{ensembl_freq}) { unless (defined($up_from)) { if (defined($supported_organism{$organism_short_name}->{'up_from'})) { $up_from = $supported_organism{$organism_short_name}->{'up_from'};; } } unless (defined($up_to)) { if (defined($supported_organism{$organism_short_name}->{'up_to'})) { $up_to = $supported_organism{$organism_short_name}->{'up_to'}; } } foreach my $masking (@masking_modes) { # foreach my $masking ("-rm") { # foreach my $noorf ("-noorf", "") { foreach my $maskcoding ("-maskcoding") { # foreach my $type ("upstream_mrna") { # foreach my $type ("intron") { # foreach my $type ("firstintron") { foreach my $type ("utr") { # $seq_type = "${type}${up_from}${up_to}${maskcoding}${masking}"; $seq_type = "${type}${maskcoding}${masking}"; # if ($purged_frequencies) { # $seq_file = "${organism_short_name}_${seq_type}_purged.fasta"; # $seq_format = "fasta"; # } else { $seq_file = "${organism_short_name}_${seq_type}.fasta"; $seq_format = "fasta"; # } &RSAT::message::TimeWarn("Calculating upstream oligo and dyad frequencies") if ($main::verbose >= 1); &CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{oligos}); # &CalcOligoDistrib($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{distrib} && !($noorf)); # &CalcDyadFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{dyads}); } # } } } } ################################################################ ## Calculate oligont frequencies in all intergenic sequences if ($task{intergenic_freq}) { $seq_type = "intergenic"; &RSAT::message::TimeWarn("Calculating ${seq_type} oligo and dyad frequencies") if ($main::verbose >= 1); if ($purged_frequencies) { $seq_file = "${organism_short_name}_intergenic_segments_purged.fasta"; $seq_format = "fasta"; } else { $seq_file = "${organism_short_name}_intergenic_segments.fasta"; $seq_format = "fasta"; } &CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{oligos}); # &CalcOligoDistrib($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{distrib}); &CalcDyadFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{dyads}); } ################################################################ ## Calculate oligo-frequencies in full genome if ($task{genome_freq}) { $seq_type = "genomic"; ## First check if a fasta file exists for the current genome ## (since 2015-11, fasta genomes are downloaded from ## ensemblgenomes). If yes, use it in priority because it allows ## to use the optiojn -quick for oligo-analysis and dyad-analysis. my $genome_dir = $ENV{RSAT}."/public_html/data/genomes/".$organism_short_name."/genome"; my $fasta_genome = $genome_dir."/".$organism_short_name.".dna.genome.fa"; if (-e $fasta_genome) { $seq_file = $fasta_genome; $seq_format = "fasta"; &CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos}); # &CalcOligoDistrib($seq_file, $seq_format, $seq_type) if ($task{distrib}); &CalcDyadFreq($seq_file, $seq_format, $seq_type) if ($task{dyads}); } else { &RSAT::message::Warning("Missing fasta genome file", $fasta_genome); &RSAT::message::Warning("Fasta genome file not found -> skipping whole genome oligo/dyad frequencies."); $seq_format = "filelist"; $seq_file = $outfile{'genome'}; } #&CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos}); # &CalcOligoDistrib($seq_file, $seq_format, $seq_type) if ($task{distrib}); #&CalcDyadFreq($seq_file, $seq_format, $seq_type) if ($task{dyads}); } ################################################################ ## calculate oligont frequencies in all gene sequences ## ## GENE FREQUENCIES ARE NOT WORKING ANYMORE, AND, BESIDES, THEY WERE NOT ## USEFUL if ($task{gene_freq}) { $seq_type = "gene"; if ($purged_frequencies) { $seq_file = "${organism_short_name}_gene_segments_purged.fasta"; $seq_format = "fasta"; } else { $seq_file = "${organism_short_name}_gene_segments.fasta"; $seq_format = "fasta"; } &CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{oligos}); # &CalcOligoDistrib($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{distrib}); &CalcDyadFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{dyads}); } ################################################################ # Calculate oligopeptide frequencies in all protein sequences if ($task{protein_freq}) { $seq_file = $peptidic_seq_file; if (-e $peptidic_seq_file) { $seq_type = "protein"; $seq_format = "fasta"; &RSAT::message::TimeWarn("Calculating oligopeptide frequencies") if ($main::verbose >= 2); &CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos}); } else { &RSAT::message::Warning("Skipping oligopeptide frequencies because peptidic sequence file is missing", $seq_file); } } chdir $dir{main}; } ################################################################ ## Calculate oligonucleotide frequencies for a specified sequence ## file sub CalcOligoFreq { my ($seq_file, $seq_format, $seq_type) = @_; &RSAT::message::TimeWarn("&CalcOligoFreq()", $seq_file, $seq_format, $seq_type) if ($main::verbose >= 2); unless ($seq_file) { &RSAT::error::FatalError("&CalcOligoFreq() called with empty sequence file name"); } unless (-e $seq_file) { &RSAT::error::FatalError("&CalcOligoFreq(): sequence file does not exist\t", $seq_file); } my @current_oligo_lengths = @oligo_lengths; # my @current_oligo_lengths = (6); my $oligo_seq_type = "dna"; my $residue_type = "nt"; my $quick = ""; my @batch_commands = (); my @strands = ("-1str", "-2str"); if ($seq_type eq "protein") { @current_oligo_lengths = 1..3; $oligo_seq_type = "prot"; $residue_type = "pept"; @strands = (""); } ## Quick option only works with DNA sequences in fasta format unless (($seq_type eq "protein") || ($seq_format ne "fasta")) { $quick = "-quick"; } ## Uncompress sequence file if required because the option -quick ## currently does not support compressed files my $uncompressed = 0; if (!(-e $seq_file) && (-e $seq_file.".gz")) { &RSAT::message::Warning("Uncompressing sequence file", $seq_file) if ($main::verbose >= 2); my $command = "gunzip ".$seq_file.".gz"; my $job_prefix = ${organism_short_name}."_compress"; $uncompressed = 1; &doit($command, $dry, $die_on_error, $main::verbose); #, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); $seq_file =~ s/\.gz$//; } my $job_prefix = ${organism_short_name}."_oligos"; foreach my $noov ("-noov", "-ovlp") { foreach my $strands (@strands) { foreach my $oligo_length (@current_oligo_lengths) { &RSAT::message::TimeWarn("Calculating oligo frequencies",$seq_file, $seq_format, $seq_type, "l=".$oligo_length, $noov, $strands) if ($main::verbose >= 2); my $out_file = $dir{oligos}."/".${oligo_length}.${residue_type}."_".${seq_type}."_".${organism_short_name}.${noov}.${strands}.".freq"; my $command = &RSAT::server::GetProgramPath("oligo-analysis"); $command .= " ".$quick; $command .= " -v 1 "; $command .= " ".$strands; $command .= " -i ".$seq_file; $command .= " -format ".$seq_format; $command .= " -seqtype ".$oligo_seq_type; $command .= " ".$noov; $command .= " -l ".$oligo_length." -type dna "; $command .= " -return freq,occ"; $command .= " -o ".$out_file; $command .= "; gzip -f ".$out_file; if ($batch) { push @batch_commands, $command; } else { &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); } } } } ## Run all the oligo frequencies in one batch job if ($batch) { my $batch_command = join(" ;", @batch_commands); &doit($batch_command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); } ## Recompress the sequence file if required if ($uncompressed) { my $command = "gzip ".$seq_file; my $job_prefix = ${organism_short_name}."_uncompress"; if ($batch) { &RSAT::message::Warning("Cannot recompress sequence in batch mode", $seq_file) if ($main::verbose >= 2); } else { &RSAT::message::Info("Compressing sequence file", $seq_file) if ($main::verbose >= 2); &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); } } } ################################################################ ## Calculate oligonucleotide distribution for a specified sequence ## file sub CalcOligoDistrib { my ($seq_file,$seq_format,$seq_type) = @_; &RSAT::message::TimeWarn("&CalcOligoDistrib()", $seq_file,$seq_format,$seq_type) if ($main::verbose >= 2); @strands = ("-1str", "-2str"); foreach my $noov ("-noov", "-ovlp") { foreach my $strands (@strands) { foreach my $oligo_length (@oligo_lengths) { my $job_prefix = ${organism_short_name}."_oligo_".$oligo_length; #### Calculate occurrence distributions in the sequence file my $distrib_file = $dir{oligos}."/"."${oligo_length}nt_${seq_type}_${organism_short_name}${noov}${strands}_distrib.tab"; my $command = &RSAT::server::GetProgramPath("oligo-analysis"); $command .= " -v 1"; $command .= " -i ".$seq_file; $command .= " -format ".$seq_format; $command .= " ".$strands; $command .= " ".$noov; $command .= " -l ".$oligo_length." -type dna"; $command .= " -return occ -distrib"; $command .= " -o ".$distrib_file; $command .= " ; gzip -f ".$distrib_file; &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); #### Fit a Poisson and a negbin on occurrence distribution foreach my $theor ("negbin", "poisson") { my $fitting_file = "$dir{oligos}/${oligo_length}nt_${seq_type}_${organism_short_name}${noov}${strands}_${theor}.tab"; $command = "fit-distribution -v 1 -i ".$distrib_file; $command .= " -o ".$fitting_file; $command .= " -distrib ".$theor; $command .= " ; gzip -f ".$fitting_file; &doit($command, $dry, $die_on_error, $main::verbose); } } } } } ################################################################ ## calculate dyad frequencies for a specified sequence ## file sub CalcDyadFreq { my ($seq_file, $seq_format, $seq_type) = @_; &RSAT::message::TimeWarn("&CalcDyadFreq()", $seq_file, $seq_format, $seq_type) if ($main::verbose >= 2); $min_spacing = 0; $max_spacing = 20; @monad_lengths = (3,2,1); @strands = ("-1str", "-2str"); my $quick = ""; my @batch_commands = (); ## Quick option only works with DNA sequences in fasta format unless (($seq_type eq "protein") || ($seq_format ne "fasta")) { $quick = "-quick"; } ## Uncompress sequence file if required because the option -quick ## currently does not support compressed files my $uncompressed = 0; if (!(-e $seq_file) && (-e $seq_file.".gz")) { &RSAT::message::Warning("Uncompressing sequence file", $seq_file) if ($main::verbose >= 2); my $command = "gunzip ".$seq_file.".gz"; my $job_prefix = ${organism_short_name}."_uncompress"; $uncompressed = 1; &doit($command, $dry, $die_on_error, $main::verbose); #, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); $seq_file =~ s/\.gz$//; } ## Compute dyad frequencies for each sequence type, strand grouping (on or off) and monad length my $job_prefix = ${organism_short_name}."_dyads"; foreach my $noov ("-noov", "-ovlp") { foreach my $strands (@strands) { foreach my $monad_length (@monad_lengths) { &RSAT::message::TimeWarn("Calculating dyad frequencies",$seq_file, $seq_format, $seq_type, "l=".$monad_length, $noov, $strands) if ($main::verbose >= 2); $dyad_file = $dir{oligos}."/"."dyads_${monad_length}nt_sp${min_spacing}-${max_spacing}_${seq_type}_${organism_short_name}${noov}${strands}"; $dyad_file .= ".freq"; my $command = &RSAT::server::GetProgramPath("dyad-analysis"); $command .= " ".$quick; $command .= " -v 1"; $command .= " -i ".$seq_file; $command .= " -format ".$seq_format; $command .= " -timeout 240000 "; $command .= " -type any -seqtype dna"; $command .= " ".$strands; $command .= " ".$noov; $command .= " -sp ".$min_spacing."-".$max_spacing; $command .= " -l $monad_length"; $command .= " -return freq,occ"; $command .= " -o ".$dyad_file; $command .= "; gzip -f ".$dyad_file; if ($batch) { push @batch_commands, $command; } else { &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); } } } } ## Run all the oligo frequencies in one batch job if ($batch) { my $batch_command = join(" ;", @batch_commands); &doit($batch_command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); } ## Recompress the sequence file if required if ($uncompressed) { my $command = "gzip ".$seq_file; my $job_prefix = ${organism_short_name}."_uncompress"; if ($batch) { &RSAT::message::Warning("Cannot recompress sequence in batch mode", $seq_file) if ($main::verbose >= 2); } else { &RSAT::message::Info("Compressing sequence file", $seq_file) if ($main::verbose >= 2); &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); } } } ################################################################ ## Display full help message sub PrintHelp { open HELP, "| more"; print HELP < performs all the formatting and calibration tasks for importing genomes from the reference databases (NCBI, EMBL) to RSAT. The program I transfers the RSAT-formatted genomes from a RSAT server. If a genome is available on the RSAT server, it is recommended to use download-genomes in order to obtain it immediately in the RSAT format, rather than install-genomes. genome-blast run blast for all the peptidic sequences of a query organism against those of a reference organism (db), and generate a table with all the detected similarities. End_of_help close HELP; exit(0); } ################################################################ ## Display short help message sub PrintOptions { open HELP, "| more"; print HELP <= 1) { $arg = shift (@arguments); # foreach my $a (0..$#ARGV) { ### verbose ### if ($arg eq "-v") { if (&IsNatural($arguments[0])) { $main::verbose = shift(@arguments); } else { $main::verbose = 1; } ## dry run } elsif ($arg eq "-n") { $dry = 1; $main::verbose = 1; ## detailed help } elsif ($arg eq "-h") { &PrintHelp(); ## list of options } elsif ($arg eq "-help") { &PrintOptions(); ## output file } elsif ($arg eq "-o") { $outputfile = shift(@arguments); ## Organism name in one word (spaces replaced by underscores) } elsif ($arg eq "-org") { my $organism_short_name = shift(@arguments); &RSAT::error::FatalError( $organism_short_name, "Invalid organism name: cannot contain spaces") if ($organism_short_name =~ /\s/); &RSAT::error::FatalError( $organism_short_name, "Invalid organism name: should not be empty.") unless ($organism_short_name =~ /\S/); &RSAT::error::FatalError( $organism_short_name, "Invalid organism name: should contain at least one underscore.") unless ($organism_short_name =~ /_/); push @organisms, $organism_short_name; ## Species, used to define the FTP site for the task "download" } elsif ($arg eq "-species") { my $species = shift(@arguments); &RSAT::error::FatalError( $organism_short_name, "Invalid species name: cannot contain spaces") if ($species =~ /\s/); &RSAT::error::FatalError( $organism_short_name, "Invalid species name: should not be empty.") unless ($species =~ /\S/); &RSAT::error::FatalError( $organism_short_name, "Invalid species name: should contain at least one underscore.") unless ($species =~ /_/); push @species, $species; ## Input file with organism list } elsif ($arg eq "-species_file") { $infile{species} = shift(@arguments); ## Strain, used to define the FTP site for the task "download" } elsif ($arg eq "-strain") { $strain = shift(@arguments); ## Strain, used to define the FTP site for the task "download" } elsif ($arg eq "-max_strains") { $max_strains = shift(@arguments); &RSAT::error::FatalError($max_strains, "is not a valid value for option -max_strains. Should be a Natural number. ") unless (&RSAT::util::IsNatural($max_strains)); ## Group, used to define the FTP site for the task "download" } elsif ($arg eq "-group") { $group = shift(@arguments); ## Input file with organism list } elsif ($arg eq "-org_file") { $infile{organisms} = shift(@arguments); ## Taxon } elsif ($arg eq "-taxon") { push @taxa, shift(@arguments); ## All organisms found in the Refseq dir } elsif ($arg eq "-all_organisms") { $main::all_organisms = 1; ## Filter assemblies based on one or more criteria } elsif ($arg eq "-filter") { my $filter_field = shift(@arguments); my $filter_value = shift(@arguments); $filter{$filter_field} = $filter_value; ## Format to export lists of selected assemblies / organisms } elsif ($arg eq "-list_format") { $list_format = shift(@arguments); unless ($supported_list_format{$list_format}) { &RSAT::error::FatalError($list_format, "is not a valid value for option -list_format. Supported: ", join(", ", keys %supported_list_format)); } ## Skip the N first organisms of the list (can be useful if the ## task was interrupted). } elsif ($arg eq "-skip") { $main::skip = shift(@arguments); &RSAT::error::FatalError($main::skip, "Invalid value for option -skip, should be a Natural number") unless (&IsNatural($main::skip)); ## Stop after the N first organisms of the list (can be useful ## for tests on a subset of the list of oganisms). } elsif ($arg eq "-last") { $main::last = shift(@arguments); &RSAT::error::FatalError($main::last, "Invalid value for option -last, should be a Natural number") unless (&IsNatural($main::last)); ## Full organism name (may include spaces) } elsif ($arg eq "-organism") { $user_specified_orgname = shift(@arguments); ## synonyms } elsif ($arg =~ /-syn/) { $infile{synonyms} = shift(@arguments); ## Specify the limits of upstream regions } elsif ($arg eq "-up_from") { $up_from = shift(@arguments); &FatalError($up_from, "Invalid value for the up_from parameter (must be integer)") unless (&IsInteger($up_from)); &FatalError($up_from, "Invalid value for the up_from parameter (must be negative)") if ($up_from >= 0); } elsif ($arg eq "-up_to") { $up_to = shift(@arguments); &FatalError(join ("\t", $up_to, "Invalid value for the to parameter (must be integer)")) unless (&IsInteger($up_to)); } elsif ($arg eq "-prefid") { $parse_options .= join(" " , " -prefid", shift(@arguments), shift(@arguments)); } elsif ($arg eq "-date") { $force_date = shift(@arguments); ## installation dir } elsif ($arg =~ /^-dir/) { $install_dir = shift(@arguments); ## Refseq dir } elsif ($arg =~ /^-refseq/) { $dir{refseq} = shift(@arguments); ## Genbank dir } elsif ($arg =~ /^-genbank/) { &RSAT::error::FatalError("Option -genbank is obsolete, use -refseq instead."); ## ENSEMBL dir } elsif ($arg =~ /^-ensembl/) { $dir{ensembl} = shift(@arguments); ## data source } elsif ($arg eq "-source") { $source = shift(@arguments); ## Batch mode, one job per task } elsif ($arg eq "-batch") { $batch = 1; ## Batch mode, one job per organism } elsif ($arg eq "-batch_org") { $batch_org = 1; ## Queue for the job scheduler } elsif ($arg eq "-queue") { $cluster_queue = shift(@arguments); ## Masking modes } elsif ($arg eq "-rm") { $task{mask} = 1; push @masking_modes, "-rm"; ## task selection } elsif (($arg eq "-task") || ($arg =~ /^-step/)) { my @requested_tasks = split ",", shift(@arguments); foreach my $task (@requested_tasks) { next unless $task; if ($supported_task{$task}) { $task{$task} = 1; push @tasks, $task; } else { &RSAT::error::FatalError("Unsupported task '$task'. \n\tSupported: $supported_tasks"); } } ## image format } elsif ($arg eq "-img_format") { $img_format = lc(shift(@arguments)); ## local configuration file specified with an environment variable } elsif ($arg =~ /^-local/) { unless ($ENV{'RSA_LOCAL_CONFIG'}) { die "Error : local config file must be specified \nin an environment variable RSA_LOCAL_CONFIG\n"; } $local_config = 1; ## alternative configuration file } elsif ($arg eq "-config") { $ENV{'RSA_LOCAL_CONFIG'} = shift(@arguments); unless ($ENV{'RSA_LOCAL_CONFIG'}) { die "Error : local config file must be specified \nin an environment variable RSA_LOCAL_CONFIG\n"; } $local_config = 1; ## Store a backup copy of organism table before overwriting it } elsif ($arg eq "-backup_org_table") { $main::backup_org_table = 1; =pod =item B<-dry> Dry run: print the commands but do not execute them. =cut } elsif ($arg eq "-dry") { $main::dry = 1;; =pod =item B<-nodie> Do not die in case a sub-program returns an error. The option -nodie allows you to circumvent problems with specific sub-tasks, but this is not recommended because the results may be incomplete. =cut } elsif ($arg eq "-nodie") { $main::die_on_error = 0; } } } ################################################################ ## Retrieve start and stop codons and calculate word occurrences ## (for checking) sub StartAndStopCodons { &RSAT::message::TimeWarn("&StartAndStopCodons()", $organism_short_name) if ($main::verbose >= 2); # my $label = "orf"; my $label = "id,ctg,reg_left,reg_right,orf_strand"; my $prefix = $dir{genome}."/".${organism_short_name}; my $job_prefix = ${organism_short_name}."_start_codons"; my $command = &RSAT::server::GetProgramPath("retrieve-seq"); $command .= " -v 1"; $command .= " -org ".$organism_short_name; $command .= " -all"; $command .= " -type upstream -feattype cds -from 0 -to 2 "; $command .= " -format wc -nocomment -label ".$label; $command .= " -o ".$prefix."_start_codons.wc"; $command .= " ; "; $command .= &RSAT::server::GetProgramPath("oligo-analysis"); $command .= " -type dna -l 3 -return occ,freq -v -format wc -1str -sort"; $command .= " -i ".$prefix."_start_codons.wc"; $command .= " -o ".$prefix."_start_codon_frequencies.tab"; # &RSAT::message::Debug($command) if ($main::verbose >= 10); &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); # &doit($command, $dry, $die_on_error, $main::verbose); &RSAT::message::Info("\tStart codons", $prefix."_start_codons.wc") if ($main::verbose >= 2); &RSAT::message::Info("\tStart codon frequencies", $prefix."_start_codon_frequencies.tab") if ($main::verbose >= 2); $job_prefix = ${organism_short_name}."_stop_codons"; $command = &RSAT::server::GetProgramPath("retrieve-seq"); $command .= " -v 1"; $command .= " -org ".$organism_short_name; $command .= " -all"; $command .= " -type downstream -feattype cds -from -2 -to 0 "; $command .= " -format wc -nocomment -label ".$label; $command .= " -o ".$prefix."_stop_codons.wc"; $command .= " ; "; $command .= &RSAT::server::GetProgramPath("oligo-analysis"); $command .= " -type dna -l 3 -return occ,freq -v -format wc -1str -sort"; $command .= " -i ".$prefix."_stop_codons.wc"; $command .= " -o ".$prefix."_stop_codon_frequencies.tab"; &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); # &doit($command, $dry, $die_on_error, $main::verbose); &RSAT::message::Info("\tStop codons", $prefix."_stop_codons.wc") if ($main::verbose >= 2); &RSAT::message::Info("\tStop codon frequencies", $prefix."_stop_codon_frequencies.tab") if ($main::verbose >= 2); } ################################################################ ## Parse the genome from Genbank files sub ParseGenome { my ($genus_species, $strain, $organism_name, $strain_dir, $genome_dir, $org_nb, %args) = @_; $strain_dir = $strain_info{$strain}->{"downloads"}; $organism_name = $strain_info{$strain}->{"organism"}; $genus_species = $strain_info{$strain}->{"genus_species"}; &RSAT::message::TimeWarn("Parsing genome", $org_nb, $genus_species, $strain, $organism_name, $strain_dir) if ($main::verbose >= 1); my $command = "$ENV{RSAT}/perl-scripts/parse-genbank.pl -v ".($main::verbose -1); $command .= " -i ".$strain_dir; $command .= $parse_options; if ($dir{ensembl}) { $command .= " -ext dat"; } else { $command .= " -ext gbff"; ## 2019 format in NCBI $command .= " -genomeInFasta "; ## 2019 format in NCBI has the genome in fasta version separated from the genebank format } $command .= " -org '".$organism_name."'"; # if ($source ne $null) { $command .= " -source '$source'"; # } $command .= " -o ".$genome_dir; &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue); } ################################################################ ## Clean up unnecessary files to save disk space sub CleanUp { &RSAT::message::TimeWarn("&CleanUp()") if ($main::verbose >= 2); chdir $dir{genome}; ## delete files with intergenic and gene segment sequences my @files = (); foreach my $seq_type (qw(intergenic gene upstream upstream-noorf)) { foreach my $format (qw(wc fasta)) { foreach my $extension ("", ".gz") { foreach my $segments ("", "_segments") { foreach my $purged ("", "_purged") { my $file = "${organism_short_name}_${seq_type}${segments}${purged}.${format}${extension}"; if (-e $file) { push @files, $file; } } } } } } foreach my $file (@files) { my $command = "rm -f $file"; &doit($command, $dry, $die_on_error, $main::verbose); } } ################################################################ ## Dowload the assembly file for the group sub DownloadAssemblySummary { my ($group) = @_; #============================================== old code ============================================== # again here "genomes" directory is not available in the current NCBI rsync server #my $assembly_summary_url = join("/", "rsync://ftp.ncbi.nlm.nih.gov", "genomes", "refseq", $group, "assembly_summary.txt"); #---------------------- New changes 07-0-2026: URL changed from rsync:// to https:// ---------------------------------- my $assembly_summary_url = join("/", "https://ftp.ncbi.nlm.nih.gov", "genomes", "refseq", $group, "assembly_summary.txt"); if ($main::verbose >= 1) { &RSAT::message::TimeWarn("Downloading NCBI assembly summary file for group\t", $group); &RSAT::message::Info("\t".$assembly_summary_url); } &RSAT::util::CheckOutDir($dir{downloads}); # my $command = "wget -nc ${assembly_url} --no-clobber --output-document ${assembly_file} ; "; my $command = "wget --no-clobber -O ".$dir{downloads}."/README_assembly_summary.txt https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt; "; &doit($command, $dry, $die_on_error, $main::verbose); #=============================== old block of downloading using rsync ============================================== #$command = "rsync -ruptvl -R --verbose ".$assembly_summary_url." ".$dir{downloads}; #$command = "rsync -ruptvl -R --verbose ".$assembly_summary_url." ".$dir{downloads}; #$command .= " && head -2 $outfile{assembly_summary} | tail -1 | perl -pe 's/\t/\n/g' | awk '{sum +=1; {print sum\"\t\"\$0}}' > $outfile{assembly_columns}"; # ------------------------------------ New changes 07-0-2026: downloading with wget --------------------------------------------- &RSAT::util::CheckOutDir($dir{assembly_summary}); $command = "wget -O ".$outfile{assembly_summary}." ".$assembly_summary_url; $command .= " && grep '^#assembly_accession' ".$outfile{assembly_summary} ." | sed 's/^#//' " ." | tr '\\t' '\\n' " ." | awk '{print NR\"\\t\"\$0}' > ".$outfile{assembly_columns}; &doit($command, $dry, $die_on_error, $main::verbose); ## Display file names if ($main::verbose >= 1) { &RSAT::message::TimeWarn("Downloaded group assembly summary file"); &RSAT::message::Info("Assembly summary file\t", $outfile{assembly_summary}); &RSAT::message::Info("Assembly summary file columns\t", $outfile{assembly_columns}); } my $available_assemblies = `grep -v '^#' $outfile{assembly_summary} | wc -l`; chomp($available_assemblies); printf $out ("; %-25s\t%s\n", "Assembly summary file", $outfile{assembly_summary}); printf $out ("; %-25s\t%s\n", "Assembly summary columns", $outfile{assembly_columns}); printf $out ("; %-25s\t%s\n", "Available assemblies", $available_assemblies); } ################################################################ ## Read assembly summary file as provided by NCBI sub ReadAssemblySummary { my ($file) = @_; ## Check if the file exists if (-e $file) { &RSAT::message::TimeWarn("Reading assembly summary file\t", $file) if ($main::verbose >= 1); } else { # die"HELLO"; &RSAT::error::FatalError("Missing assembly summary file\t", $file); } ## Parse the file my @header; my %strain_info; &RSAT::message::TimeWarn("Reading assembly summary table\t", $file) if ($main::verbose >= 1); my $l = 0; ## Line number in the assembly file my $selected = 0; ## Number of selected assemblies my ($asmb) = &OpenInputFile($file); my @column_index; while (<$asmb>) { s/\r//; $l++; chomp(); if (/^#/) { if (/^#assembly_accession/) { ## Parse column headers s/^#//; @header = split("\t"); for $h (0..$#header) { $column_index{$header[$h]} = $h; } if ($main::verbose >= 3) { print $out "; Column content\n"; for my $f (1..scalar(@header)) { print $out join("\t", ";", $f, $header[$f-1]), "\n"; } } } else { next; ## Skip comment rows } } else { my $pass_filter = 1; my @fields = split("\t"); ################################################################ ## Run filters on user-specified fields for my $field (sort(keys(%filter))) { unless ($fields[$column_index{$field}] =~ /$filter{$field}/) { $pass_filter = 0; last; } } next unless $pass_filter; ################################################################ ## Fix problems with some organism or assembly names that contain ## problematic characters for folder names (brackets, ## parentheses, quotes, ...). my $organism_name = &CleanPathName($fields[$column_index{'organism_name'}]); if ($fields[$column_index{'organism_name'}] ne $organism_name) { &RSAT::message::Warning("Organism name fixed", $fields[$column_index{'organism_name'}], $organism_name) if ($main::verbose >= 5); $fields[$column_index{'organism_name'}] = $organism_name; } my $assembly_accession = &CleanPathName($fields[$column_index{'assembly_accession'}]); if ($fields[$column_index{'assembly_accession'}] ne $assembly_accession) { &RSAT::message::Warning("Assembly accession", $fields[$column_index{'assembly_accession'}], $assembly_accession) if ($main::verbose >= 3); $fields[$column_index{'assembly_accession'}] = $assembly_accession; } my $asm_name = &CleanPathName($fields[$column_index{'asm_name'}]); if ($fields[$column_index{'asm_name'}] ne $asm_name) { &RSAT::message::Warning("Assembly name fixed", $fields[$column_index{'asm_name'}], $asm_name) if ($main::verbose >= 3); $fields[$column_index{'asm_name'}] = $asm_name; } ################################################################ ## Compute some fields by combining extracted field my $current_strain = join("_", $fields[$column_index{'assembly_accession'}], $fields[$column_index{'asm_name'}]); # $current_strain =~s/ /_/g; ## Fix problem with some spaces in asm_name ## Check if current strain corresponds to selected strain if ($strain) { next unless $current_strain eq $strain; } next unless $fields[$column_index{'assembly_accession'}] && $fields[$column_index{'asm_name'}] && $current_strain; ## Split organism_name into genus and species ## ## Note: this assumes that genus name does not contain any ## underscore. Besides , "species" here may include a ## subspecies suffix (e.g. strain name), but we count them ## together as in NCBI. my ($genus, $species) = split("_", $organism_name, 2); my $genus_species = $genus."_".$species; # &RSAT::message::Debug("genus_species : ", $genus_species) if ($main::verbose >= 10); ## Filter on species if (scalar(@species) > 0) { unless ($selected_species{$genus_species}) { next; } } # ## Run filters on user-specified fields # for my $field (sort(keys(%filter))) { # unless ($fields[$column_index{$field}] =~ /$filter{$field}/) { # $pass_filter = 0; # last; # } # } # next unless $pass_filter; ## Create an entry for this assembly in the strain_info hash $selected += 1; for my $f (0..$#fields) { $strain_info{$current_strain}->{$header[$f]} = $fields[$f]; # &RSAT::message::Debug( # $f, # $header[$f], # $strain_info{$current_strain}, # join(";", sort(keys(%strain_info))), # $fields[$f], # $strain_info{$current_strain}->{$header[$f]}, # ) if ($main::verbose >= 10); } # die (join("\t", "BOUM", $current_strain, # $strain_info{$current_strain}, # join("; ", sort(keys(%strain_info)))), # "\n"); ## Define some extra fields $strain_info{$current_strain}->{"n"} = $selected; $strain_info{$current_strain}->{"genus"} = $genus; $strain_info{$current_strain}->{"species"} = $species; $strain_info{$current_strain}->{"genus_species"} = $genus_species; $strain_info{$current_strain}->{"strain"} = $current_strain; $strain_info{$current_strain}->{"organism"} = join("_", $genus, $species, $current_strain); $strain_info{$current_strain}->{"organism"} =~ s/ //g; # die (join("'\t'", "HELLO", $organism_name, $genus, $species, $assembly_accession, $asm_name, $current_strain, $strain_info{$current_strain}->{"organism"}), "\n"); ## Define download directory for this assembly ## Will be used for both tasks download and parse $strain_info{$current_strain}->{"downloads"} = join("/", $ENV{RSAT}, "downloads", "refseq", $group, $strain_info{$current_strain}->{"genus"}, $strain_info{$current_strain}->{"genus_species"}, $current_strain); # &RSAT::message::Debug("genus : ", $genus) if ($main::verbose >= 10); # &RSAT::message::Debug("species : ", $species) if ($main::verbose >= 10); # &RSAT::message::Debug("genus_species : ", $genus_species) if ($main::verbose >= 10); # &RSAT::message::Debug("download dir", $strain_info{$current_strain}->{"downloads"}) if ($main::verbose >= 10); ## For debugging, report the 10 first assemblies if (($main::verbose >= 3) && ($selected <= 10)) { print join("\t", ";", $selected, $l, $strain_info{$current_strain}->{"organism"}, $strain_info{$current_strain}->{"organism_name"}, $strain_info{$current_strain}->{"genus"}, $strain_info{$current_strain}->{"species"}, $strain_info{$current_strain}->{"assembly_accession"}, $strain_info{$current_strain}->{"asm_name"}, $strain_info{$current_strain}->{"genus_species"}, $strain_info{$current_strain}->{"strain"}, ), "\n"; } } } if ($main::verbose >= 1) { &RSAT::message::TimeWarn("Read assembly summaries"); &RSAT::message::Info($l, "lines"); &RSAT::message::Info($selected, "selected assemblies"); } return(%strain_info); } ################################################################ ## Export a table with a subset of the fields for selected organisms sub ListAssemblies { my ($list_format, %strain_info) = @_; my @sorted_strains = sort {$strain_info{$a}->{"organism"} cmp $strain_info{$b}->{"organism"}} keys(%strain_info); # my @sorted_strains = sort {$strain_info{$a}->{"n"} <=> $strain_info{$b}->{"n"}} keys(%strain_info); # my @sorted_strains = sort keys(%strain_info); if ($list_format eq "org") { for my $current_strain (@sorted_strains) { print $out $strain_info{$current_strain}->{"organism"}, "\n" if ($current_strain); } } elsif ($list_format eq "bash") { ## Print a bash file with the instructions to download and install the selected genomes from print $out "#!bash\n"; print $out "TASKS=download,parse,config,default\n"; for my $current_strain (@sorted_strains) { if ($strain_info{$current_strain}->{"genus_species"}) { print $out "install-organism -v 1 -group ", $group, " -species ", $strain_info{$current_strain}->{"genus_species"}, " -strain ", $strain_info{$current_strain}->{"strain"}, " -task \$TASKS", "\n"; } } } else { ## Print a table with selected fields my @fields = qw( organism taxid species_taxid genus_species strain genus species assembly_accession asm_name n ); print $out "#", join("\t", @fields, "n"), "\n"; $n = 0; for my $current_strain (@sorted_strains) { # $n++; my @values = (); for my $field (@fields) { push @values, $strain_info{$current_strain}->{$field}; } # push @values, $n; print $out join("\t", @values), "\n"; } } } ################################################################ ## Suppress empty rows from peptidic sequence files (present in NCBI ## genomes) because the first empty row raises a bug with diamond ## formatdb and blastall. sub CleanProteinSequences { my ($peptidic_seq_file) = @_; if (-e $peptidic_seq_file) { my $empty_rows = `grep '^\$' ${peptidic_seq_file} | wc -l `; $empty_rows =~ s/\s//g; if ($empty_rows > 0) { if ($main::verbose >= 3) { &RSAT::message::Warning("Peptidic sequence file contains ".$empty_rows." empty rows"); &RSAT::message::Info("Cleaning ".$empty_rows." empty rows from peptidic sequence fasta file"); } &doit('perl -pi -e "s/^\n$//" '.$peptidic_seq_file); } } else { &RSAT::message::Warning("Peptidic sequence file is missing", $peptidic_seq_file); } } sub CleanPathName { my ($name) = @_; # suppress problematic characters $name =~ s/ /_/g; $name =~ s/\[//g; $name =~ s/\]//g; $name =~ s/\'//g; $name =~ s/\"//g; $name =~ s/\)//g; $name =~ s/\(//g; $name =~ s/\///g; return ($name); }