Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --details option for showing alignment details #318

Merged
merged 16 commits into from
Jun 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ jobs:
cd build
make -j3
make install
- name: Install Linux dependencies
if: runner.os == 'Linux'
run: sudo apt-get install samtools
- name: Install macOS dependencies
if: runner.os == 'macOS'
run: brew install samtools
- name: Run
run: tests/run.sh

Expand Down
12 changes: 12 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,15 @@ style guide, but new code should follow it.
should be `ClassName`, `variable_name`, `method_name`, `CONSTANT`.
* The header guard of a file named `xyz.hpp` should be named
`STROBEALIGN_XYZ_HPP`.

## Detailed SAM output (`--details`)

When `--details` or `-d` is provided, the following additional SAM tags are output for
mapped reads.

`na`: Number of NAMs (seeds) found
`nr`: Whether NAM rescue was needed (1 if yes, 0 if not)
`mr`: Number of times mate rescue was attempted (local alignment of a mate to
the expected region given its mate)
`al`: Number of times an attempt was made to extend a seed (by gapped or ungapped alignment)
`ga`: Number of times an attempt was made to extend a seed by gapped alignment
268 changes: 132 additions & 136 deletions src/aln.cpp

Large diffs are not rendered by default.

29 changes: 20 additions & 9 deletions src/aln.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "sam.hpp"
#include "aligner.hpp"


struct AlignmentStatistics {
std::chrono::duration<double> tot_read_file{0};
std::chrono::duration<double> tot_construct_strobemers{0};
Expand All @@ -19,12 +20,12 @@ struct AlignmentStatistics {
std::chrono::duration<double> tot_extend{0};
std::chrono::duration<double> tot_write_file{0};

uint64_t n_reads = 0;
uint64_t tot_aligner_calls = 0;
uint64_t tot_rescued = 0;
uint64_t tot_all_tried = 0;
uint64_t did_not_fit = 0;
uint64_t tried_rescue = 0;
uint64_t n_reads{0};
uint64_t tot_aligner_calls{0};
uint64_t tot_rescued{0};
uint64_t tot_all_tried{0};
uint64_t inconsistent_nams{0};
uint64_t nam_rescue{0};

AlignmentStatistics operator+=(const AlignmentStatistics& other) {
this->tot_read_file += other.tot_read_file;
Expand All @@ -39,8 +40,17 @@ struct AlignmentStatistics {
this->tot_aligner_calls += other.tot_aligner_calls;
this->tot_rescued += other.tot_rescued;
this->tot_all_tried += other.tot_all_tried;
this->did_not_fit += other.did_not_fit;
this->tried_rescue += other.tried_rescue;
this->inconsistent_nams += other.inconsistent_nams;
this->nam_rescue += other.nam_rescue;
return *this;
}

AlignmentStatistics operator+=(const Details& details) {
this->nam_rescue += details.nam_rescue;
this->tot_rescued += details.mate_rescue;
this->tot_all_tried += details.tried_alignment;
this->inconsistent_nams += details.nam_inconsistent;

return *this;
}
};
Expand All @@ -53,8 +63,9 @@ struct mapping_params {
int maxTries { 20 };
int rescue_cutoff;
bool is_sam_out { true };
bool cigar_eqx { true };
CigarOps cigar_ops{CigarOps::M};
bool output_unmapped { true };
bool details{false};
};

class i_dist_est {
Expand Down
2 changes: 2 additions & 0 deletions src/cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
args::Flag interleaved(parser, "interleaved", "Interleaved input", {"interleaved"});
args::ValueFlag<std::string> rgid(parser, "ID", "Read group ID", {"rg-id"});
args::ValueFlagList<std::string> rg(parser, "TAG:VALUE", "Add read group metadata to SAM header (can be specified multiple times). Example: SM:samplename", {"rg"});
args::Flag details(parser, "details", "Add debugging details to SAM records", {"details"});

args::ValueFlag<int> N(parser, "INT", "Retain at most INT secondary alignments (is upper bounded by -M and depends on -S) [0]", {'N'});
args::ValueFlag<std::string> index_statistics(parser, "PATH", "Print statistics of indexing to PATH", {"index-statistics"});
Expand Down Expand Up @@ -88,6 +89,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
// Input/output
if (o) { opt.output_file_name = args::get(o); opt.write_to_stdout = false; }
if (v) { opt.verbose = true; }
if (details) { opt.details = true; }
if (no_progress) { opt.show_progress = false; }
if (eqx) { opt.cigar_eqx = true; }
if (x) { opt.is_sam_out = false; }
Expand Down
1 change: 1 addition & 0 deletions src/cmdline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ struct CommandLineOptions {
std::string output_file_name;
bool write_to_stdout { true };
bool verbose { false };
bool details{false};
bool show_progress { true };
bool cigar_eqx { false };
std::string read_group_id { "" };
Expand Down
9 changes: 5 additions & 4 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,9 @@ int run_strobealign(int argc, char **argv) {
map_param.R = opt.R;
map_param.maxTries = opt.maxTries;
map_param.is_sam_out = opt.is_sam_out;
map_param.cigar_eqx = opt.cigar_eqx;
map_param.cigar_ops = opt.cigar_eqx ? CigarOps::EQX : CigarOps::M;
map_param.output_unmapped = opt.output_unmapped;
map_param.details = opt.details;

log_parameters(index_parameters, map_param, aln_params);
logger.debug() << "Threads: " << opt.n_threads << std::endl;
Expand Down Expand Up @@ -311,9 +312,9 @@ int run_strobealign(int argc, char **argv) {

logger.info() << "Total mapping sites tried: " << tot_statistics.tot_all_tried << std::endl
<< "Total calls to ssw: " << tot_statistics.tot_aligner_calls << std::endl
<< "Calls to ksw (rescue mode): " << tot_statistics.tot_rescued << std::endl
<< "Did not fit strobe start site: " << tot_statistics.did_not_fit << std::endl
<< "Tried rescue: " << tot_statistics.tried_rescue << std::endl
<< "Inconsistent NAM ends: " << tot_statistics.inconsistent_nams << std::endl
<< "Tried NAM rescue: " << tot_statistics.nam_rescue << std::endl
<< "Mates rescued by alignment: " << tot_statistics.tot_rescued << std::endl
<< "Total time mapping: " << map_align_timer.elapsed() << " s." << std::endl
<< "Total time reading read-file(s): " << tot_statistics.tot_read_file.count() / opt.n_threads << " s." << std::endl
<< "Total time creating strobemers: " << tot_statistics.tot_construct_strobemers.count() / opt.n_threads << " s." << std::endl
Expand Down
2 changes: 1 addition & 1 deletion src/nam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,6 @@ std::vector<Nam> find_nams_rescue(
}

std::ostream& operator<<(std::ostream& os, const Nam& n) {
os << "Nam(query: " << n.query_s << ".." << n.query_e << ", ref: " << n.ref_s << ".." << n.ref_e << ", score=" << n.score << ")";
os << "Nam(ref_id=" << n.ref_id << ", query: " << n.query_s << ".." << n.query_e << ", ref: " << n.ref_s << ".." << n.ref_e << ", score=" << n.score << ")";
return os;
}
23 changes: 11 additions & 12 deletions src/pc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,12 @@ void distribute_interleaved(
}
}


size_t InputBuffer::read_records(std::vector<klibpp::KSeq> &records1,
std::vector<klibpp::KSeq> &records2,
std::vector<klibpp::KSeq> &records3,
AlignmentStatistics &statistics,
int to_read) {
Timer timer;
size_t InputBuffer::read_records(
std::vector<klibpp::KSeq> &records1,
std::vector<klibpp::KSeq> &records2,
std::vector<klibpp::KSeq> &records3,
int to_read
) {
records1.clear();
records2.clear();
records3.clear();
Expand All @@ -97,7 +96,6 @@ size_t InputBuffer::read_records(std::vector<klibpp::KSeq> &records1,
}

unique_lock.unlock();
statistics.tot_read_file += timer.duration();

return current_chunk_index;
}
Expand Down Expand Up @@ -149,9 +147,10 @@ void perform_task(
std::vector<klibpp::KSeq> records1;
std::vector<klibpp::KSeq> records2;
std::vector<klibpp::KSeq> records3;
auto chunk_index = input_buffer.read_records(records1, records2, records3, statistics);
Timer timer;
auto chunk_index = input_buffer.read_records(records1, records2, records3);
statistics.tot_read_file += timer.duration();
assert(records1.size() == records2.size());
i_dist_est isize_est;
if (records1.empty()
&& records3.empty()
&& input_buffer.finished_reading){
Expand All @@ -160,8 +159,8 @@ void perform_task(

std::string sam_out;
sam_out.reserve(7*map_param.r * (records1.size() + records3.size()));
CigarOps cigar_ops = map_param.cigar_eqx ? CigarOps::EQX : CigarOps::M;
Sam sam{sam_out, references, cigar_ops, read_group_id, map_param.output_unmapped};
Sam sam{sam_out, references, map_param.cigar_ops, read_group_id, map_param.output_unmapped, map_param.details};
i_dist_est isize_est;
for (size_t i = 0; i < records1.size(); ++i) {
auto record1 = records1[i];
auto record2 = records2[i];
Expand Down
11 changes: 6 additions & 5 deletions src/pc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ class InputBuffer {
bool is_interleaved{false};

void rewind_reset();
size_t read_records(std::vector<klibpp::KSeq> &records1,
std::vector<klibpp::KSeq> &records2,
std::vector<klibpp::KSeq> &records3,
AlignmentStatistics &statistics,
int read_count=-1);
size_t read_records(
std::vector<klibpp::KSeq> &records1,
std::vector<klibpp::KSeq> &records2,
std::vector<klibpp::KSeq> &records3,
int read_count=-1
);
};


Expand Down
3 changes: 1 addition & 2 deletions src/readlen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ int estimate_read_length(InputBuffer& input_buffer) {
std::vector<klibpp::KSeq> records1;
std::vector<klibpp::KSeq> records2;
std::vector<klibpp::KSeq> records3;
AlignmentStatistics stats;
input_buffer.read_records(records1, records2, records3, stats, 500);
input_buffer.read_records(records1, records2, records3, 500);
if (records1.empty() && records3.empty()) {
return 150;
}
Expand Down
48 changes: 37 additions & 11 deletions src/sam.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "sam.hpp"
#include <algorithm>
#include <ostream>
#include <sstream>

#define SAM_UNMAPPED_MAPQ 0
#define SAM_UNMAPPED_MAPQ_STRING "0"
Expand Down Expand Up @@ -38,10 +39,25 @@ std::string strip_suffix(const std::string& name) {
}
}

void Sam::append_tail() {
void Sam::append_rg_and_newline() {
sam_string.append(tail);
}

void Sam::append_details(const Details& details) {
std::stringstream s;
s << "\tna:i:" << details.nams;
s << "\tnr:i:" << details.nam_rescue;
s << "\tal:i:" << details.tried_alignment;
s << "\tga:i:" << details.gapped;
sam_string.append(s.str());
}

void Sam::append_paired_details(const Details& details) {
std::stringstream s;
s << "\tmr:i:" << details.mate_rescue;
sam_string.append(s.str());
}

std::string Sam::cigar_string(const Cigar& cigar) const {
if (cigar.empty()) {
// This case should normally not occur because
Expand All @@ -68,7 +84,7 @@ void Sam::add_unmapped(const KSeq& record, int flags) {
sam_string.append("\t*\t0\t" SAM_UNMAPPED_MAPQ_STRING "\t*\t*\t0\t0\t");
sam_string.append(record.seq);
append_qual(record.qual);
append_tail();
append_rg_and_newline();
}

void Sam::add_unmapped_mate(const KSeq& record, int flags, const std::string& mate_reference_name, int mate_pos) {
Expand All @@ -90,7 +106,7 @@ void Sam::add_unmapped_mate(const KSeq& record, int flags, const std::string& ma
sam_string.append("\t0\t");
sam_string.append(record.seq);
append_qual(record.qual);
append_tail();
append_rg_and_newline();
}

void Sam::add_unmapped_pair(const KSeq& r1, const KSeq& r2) {
Expand All @@ -103,17 +119,18 @@ void Sam::add(
const Alignment& sam_aln,
const KSeq& record,
const std::string& sequence_rc,
bool is_secondary
bool is_primary,
const Details& details
) {
assert(!sam_aln.is_unaligned);
int flags = 0;
if (!sam_aln.is_unaligned && sam_aln.is_rc) {
flags |= REVERSE;
}
if (is_secondary) {
if (!is_primary) {
flags |= SECONDARY;
}
add_record(record.name, flags, references.names[sam_aln.ref_id], sam_aln.ref_start, sam_aln.mapq, sam_aln.cigar, "*", -1, 0, record.seq, sequence_rc, record.qual, sam_aln.ed, sam_aln.aln_score);
add_record(record.name, flags, references.names[sam_aln.ref_id], sam_aln.ref_start, sam_aln.mapq, sam_aln.cigar, "*", -1, 0, record.seq, sequence_rc, record.qual, sam_aln.ed, sam_aln.aln_score, details);
}

// Add one individual record
Expand All @@ -131,7 +148,8 @@ void Sam::add_record(
const std::string& query_sequence_rc,
const std::string& qual,
int ed,
int aln_score
int aln_score,
const Details& details
) {
sam_string.append(strip_suffix(query_name));
sam_string.append("\t");
Expand Down Expand Up @@ -180,7 +198,14 @@ void Sam::add_record(
} else {
append_qual(qual);
}
append_tail();

if (show_details) {
append_details(details);
if (flags & PAIRED) {
append_paired_details(details);
}
}
append_rg_and_newline();
}

void Sam::add_pair(
Expand All @@ -193,7 +218,8 @@ void Sam::add_pair(
int mapq1,
int mapq2,
bool is_proper,
bool is_primary
bool is_primary,
const std::array<Details, 2>& details
) {
int f1 = PAIRED | READ1;
int f2 = PAIRED | READ2;
Expand Down Expand Up @@ -273,12 +299,12 @@ void Sam::add_pair(
if (sam_aln1.is_unaligned) {
add_unmapped_mate(record1, f1, reference_name2, pos2);
} else {
add_record(record1.name, f1, reference_name1, sam_aln1.ref_start, mapq1, sam_aln1.cigar, mate_reference_name2, pos2, template_len1, record1.seq, read1_rc, record1.qual, edit_distance1, sam_aln1.aln_score);
add_record(record1.name, f1, reference_name1, sam_aln1.ref_start, mapq1, sam_aln1.cigar, mate_reference_name2, pos2, template_len1, record1.seq, read1_rc, record1.qual, edit_distance1, sam_aln1.aln_score, details[0]);
}
if (sam_aln2.is_unaligned) {
add_unmapped_mate(record2, f2, reference_name1, pos1);
} else {
add_record(record2.name, f2, reference_name2, sam_aln2.ref_start, mapq2, sam_aln2.cigar, mate_reference_name1, pos1, -template_len1, record2.seq, read2_rc, record2.qual, edit_distance2, sam_aln2.aln_score);
add_record(record2.name, f2, reference_name2, sam_aln2.ref_start, mapq2, sam_aln2.cigar, mate_reference_name1, pos1, -template_len1, record2.seq, read2_rc, record2.qual, edit_distance2, sam_aln2.aln_score, details[1]);
}
}

Expand Down
Loading