#!/usr/bin/env perl # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute # Copyright [2016-2019] EMBL-European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. use strict; use warnings; use Getopt::Long; use Bio::EnsEMBL::Registry; use Bio::EnsEMBL::Compara::GenomicAlign; use Bio::EnsEMBL::Compara::GenomicAlignBlock; my %genomic_aligns; my %genomic_align_blocks; my %r_components; my %r_overall; my $atac_in; my $reg_conf; my $mlss_id; my $r_gdbid; my $q_gdbid; my $starting_group_id; my $nodb; my $logfile; my $help; GetOptions("atac=s" => \$atac_in, "reg-conf=s" => \$reg_conf, "mlss=i" => \$mlss_id, "r-gdb-id=i" => \$r_gdbid, "q-gdb-id=i" => \$q_gdbid, "starting-group-id=i" => \$starting_group_id, "nodb" => \$nodb, "logfile=s" => \$logfile, "help" => \$help); if ($help || !$atac_in || !$reg_conf || !$mlss_id || !$r_gdbid || !$q_gdbid || !$starting_group_id) { die usage(); } open ATAC, $atac_in or die "couldn't open ATAC file $atac_in $!\n"; if ($logfile) { open LOGFILE, ">$logfile" or die "couldn't open logfile $logfile $!\n"; select((select(LOGFILE), $| = 1)[0]); #make LOGFILE unbuffered (a.k.a. "hot"). #See http://perl.plover.com/FAQs/Buffering.html } print LOGFILE "setting up adaptors\n" if $logfile; my $reg = "Bio::EnsEMBL::Registry"; $reg->load_all($reg_conf); my $dfa = $reg->get_adaptor("Multi", "compara", "DnaFrag"); my $gaa = $reg->get_adaptor("Multi", "compara", "GenomicAlign"); my $gaba = $reg->get_adaptor("Multi", "compara", "GenomicAlignBlock"); my $mlssa = $reg->get_adaptor("Multi", "compara", "MethodLinkSpeciesSet"); my $mlss_o = $mlssa->fetch_by_dbID($mlss_id); print LOGFILE "fetched mlss " . $mlss_o->dbID() . "\n" if $logfile; validate_mlss_and_species($r_gdbid, $q_gdbid, $mlss_o) or die "mlss $mlss_id does not link genome_dbs $r_gdbid and $q_gdbid $!\n"; my $next_ga_id = ($mlss_id * 10**10) + 1; my $next_gab_id = $next_ga_id; print LOGFILE "starting genomic align [block] ID will be $next_ga_id\n"; print LOGFILE "starting to read atac file\n" if $logfile; my $atac_lines = 1; while () { next unless (/^M/); chomp; my @line = split; #ATAC coords are 0 based. if ($line[1] eq 'u') { my $ungapped_hit = {'r_id' => $line[4], 'r_start' => $line[5] + 1, 'r_length' => $line[6], 'r_strand' => $line[7], 'q_id' => $line[8], 'q_start' => $line[9] + 1, 'q_length' => $line[10], 'q_strand' => $line[11]}; push(@{$r_components{$line[3]}}, $ungapped_hit); if ($logfile && (($atac_lines % 10000) == 0)) { print LOGFILE "read $atac_lines atac lines\n"; } $atac_lines++; } elsif ($line[1] eq 'r') { $r_overall{$line[2]} = {'r_id' => $line[4], 'r_start' => $line[5] + 1, 'r_length' => $line[6], 'r_strand' => $line[7], 'q_id' => $line[8], 'q_start' => $line[9] + 1, 'q_length' => $line[10], 'q_strand' => $line[11]}; if ($logfile && (($atac_lines % 10000) == 0)) { print LOGFILE "read $atac_lines atac lines\n"; } $atac_lines++; } else { next; } } print LOGFILE "processing hsps\n" if $logfile; my $group_id = $starting_group_id; foreach my $run_id (keys(%r_components)) { my $ref_dnafrag = $dfa->fetch_by_GenomeDB_and_name($r_gdbid, $r_overall{$run_id}->{r_id}); my $query_dnafrag = $dfa->fetch_by_GenomeDB_and_name($q_gdbid, $r_overall{$run_id}->{q_id}); die "could not find ref dnafrag for " . $r_overall{$run_id}->{r_id} . "\n" unless $ref_dnafrag; die "could not find query dnafrag for " . $r_overall{$run_id}->{q_id} . "\n" unless $query_dnafrag; my @run_components = @{$r_components{$run_id}}; @run_components = sort {$a->{'r_start'} <=> $b->{'r_start'}} @run_components; foreach my $run_component (@run_components) { my $ref_ga = new Bio::EnsEMBL::Compara::GenomicAlign (-adaptor => $gaa, -method_link_species_set => $mlss_o, -dnafrag => $ref_dnafrag, -dnafrag_start => $run_component->{r_start}, -dnafrag_end => $run_component->{r_start} + ($run_component->{r_length} - 1), -dnafrag_strand => $run_component->{r_strand}, -cigar_line => $run_component->{r_length} . "M", -dbID => $next_ga_id, -visible => 1); $next_ga_id++; my $query_ga = new Bio::EnsEMBL::Compara::GenomicAlign (-adaptor => $gaa, -method_link_species_set => $mlss_o, -dnafrag => $query_dnafrag, -dnafrag_start => $run_component->{q_start}, -dnafrag_end => $run_component->{q_start} + ($run_component->{q_length} - 1), -dnafrag_strand => $run_component->{q_strand}, -cigar_line => $run_component->{q_length} . "M", -dbID => $next_ga_id, -visible => 1); $next_ga_id++; my $new_gab = new Bio::EnsEMBL::Compara::GenomicAlignBlock (-adaptor => $gaba, -method_link_species_set => $mlss_o, -length => $run_component->{q_length}, -genomic_align_array => [$ref_ga, $query_ga], -group_id => $group_id, -dbID => $next_gab_id ); $next_gab_id++; $gaba->store($new_gab) unless $nodb; } if ($logfile) { if ((($group_id - $starting_group_id + 1) % 1000) == 0) { print LOGFILE "processed " . ($group_id - $starting_group_id + 1) . " hsps\n"; } } $group_id++; } sub validate_mlss_and_species { my ($r_gdbid, $q_gdbid, $mlss_o) = @_; my @given_gdbids = ($r_gdbid, $q_gdbid); @given_gdbids = sort {$a <=> $b} @given_gdbids; my $ss_gdbs = $mlss_o->species_set()->genome_dbs(); my @ss_gdbids = map {$_->dbID()} @{$ss_gdbs}; #assume if there is one gdb_id in the species set that this #is a triticum self alignment, so duplicate the id if (scalar(@ss_gdbids) == 1) { push(@ss_gdbids, $ss_gdbids[0]); } @ss_gdbids = sort {$a <=> $b} @ss_gdbids; return ((join(",", @given_gdbids)) eq (join(",", @ss_gdbids))); } sub usage { my $usagestr = <<'END_USAGE'; usage: atac-parser.pl [options] required options: --atac filename file containing ATAC output --reg-conf reg_conf_file registry file pointing to the compara database to load to --r-gdb-id genome_db_id genome_db_id of reference genome (second set of columns in standard ATAC output) --q-gdb-id genome_db_id genome_db_id of query genome (first set of columns in standard ATAC output) --mlss mlss_id mlss_id for storing alignments --starting-group-id int new group_id number to start with (for genomic_align_blocks) optional options: --logfile filename log of script progress --nodb if set, do not write to database END_USAGE print STDERR "$usagestr\n"; }