libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
semiglobalalignment.cpp
Go to the documentation of this file.
1/**
2 * \file pappsomspp/processing/specpeptidoms/semiglobalalignment.cpp
3 * \date 24/03/2025
4 * \author Aurélien Berthier
5 * \brief protein to spectrum alignment
6 *
7 * C++ implementation of the SpecPeptidOMS algorithm described in :
8 * (1) Benoist, É.; Jean, G.; Rogniaux, H.; Fertin, G.; Tessier, D. SpecPeptidOMS Directly and
9 * Rapidly Aligns Mass Spectra on Whole Proteomes and Identifies Peptides That Are Not Necessarily
10 * Tryptic: Implications for Peptidomics. J. Proteome Res. 2025.
11 * https://doi.org/10.1021/acs.jproteome.4c00870.
12 */
13
14/*
15 * Copyright (c) 2025 Aurélien Berthier
16 * <aurelien.berthier@ls2n.fr>
17 *
18 *
19 * This program is free software: you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published by
21 * the Free Software Foundation, either version 3 of the License, or
22 * (at your option) any later version.
23 *
24 * This program is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * GNU General Public License for more details.
28 *
29 * You should have received a copy of the GNU General Public License
30 * along with this program. If not, see <http://www.gnu.org/licenses/>.
31 */
32
33#include <QString>
34#include <QStringRef>
35#include <algorithm>
36#include "semiglobalalignment.h"
39#include "correctiontree.h"
42
43void
45{
46 peaks.clear();
47 m_peptideModel.reset();
48 score = 0;
49 begin_shift = 0.0;
50 end_shift = 0.0;
51 shifts.clear();
52 SPC = 0;
53 beginning = 0;
54 end = 0;
55}
56
57
58QString
59pappso::specpeptidoms::Alignment::getPeptideString(const QString &protein_sequence) const
60{
61 return protein_sequence.mid(beginning, end - beginning);
62}
63
64double
66{
67 double sum_of_elems = std::accumulate(shifts.begin(), shifts.end(), 0);
68 return begin_shift + sum_of_elems + end_shift;
69}
70
71
72std::size_t
77
78
80 const ScoreValues &score_values,
81 const pappso::PrecisionPtr precision_ptr,
82 const pappso::AaCode &aaCode)
83 : m_scorevalues(score_values), m_aaCode(aaCode)
84{
85 m_precision_ptr = precision_ptr;
86
87 KeyCell key_cell_init_first;
88 key_cell_init_first.n_row = 0;
89 key_cell_init_first.score = 0;
90 key_cell_init_first.beginning = 0;
91 key_cell_init_first.tree_id = 0;
92 m_interest_cells.push_back(key_cell_init_first);
93}
94
95const std::vector<pappso::specpeptidoms::KeyCell> &
100
101void
103 const SpOMSProtein *protein_ptr)
104{
105 std::size_t sequence_length = protein_ptr->size();
106
107 initFastAlign(spectrum);
108
109 for(std::size_t row_number = 1; row_number <= sequence_length; row_number++)
110 {
111 updateAlignmentMatrix(*protein_ptr,
112 row_number,
113 spectrum.getAaPositions(protein_ptr->at(row_number - 1).code),
114 spectrum,
115 true,
116 protein_ptr);
117 }
118}
119
120void
122{
123 // m_scenario.clear();
124 // TODO don't forget to reset any important variable
125 // m_best_alignment.reset();
126 // m_best_corrected_alignment.reset();
127 // m_best_post_processed_alignment.reset();
128
129 KeyCell key_cell_init;
130 key_cell_init.n_row = 0;
131 key_cell_init.score = m_scorevalues.get(ScoreType::init);
132 key_cell_init.beginning = 0;
133 key_cell_init.tree_id = 0;
134
135 m_interest_cells.resize(spectrum.size());
136 std::fill(m_interest_cells.begin(), m_interest_cells.end(), key_cell_init);
137
138 m_interest_cells.at(0).score = 0;
139
140 // m_location_saver.resetLocationSaver();
141 m_updated_cells.clear();
142}
143
144void
146 const SpOMSProtein *protein_ptr,
147 const std::size_t beginning,
148 const std::size_t length)
149{
150 try
151 {
152 qDebug();
153 const QString &protein_seq = protein_ptr->getSequence();
154 std::size_t length2;
155 if((qsizetype)(beginning + length) <= protein_seq.size())
156 {
157 length2 = length;
158 }
159 else
160 {
161 length2 = protein_seq.size() - beginning;
162 }
163
164 qDebug();
165 QString sequence_str = protein_seq.sliced(protein_seq.size() - beginning - length2, length2);
166
167 SpOMSProtein sequence("sub_sequence", sequence_str, m_aaCode);
168
169 // std::reverse(sequence.begin(), sequence.end());
170 std::vector<AaPosition> aa_positions;
171 CorrectionTree correction_tree;
172
173 qDebug();
174 m_scenario.reserve(length2 + 1, spectrum.size());
175 m_interest_cells.reserve(spectrum.size());
176 m_interest_cells.at(0).n_row = 0;
177 m_interest_cells.at(0).score = 0;
178 m_interest_cells.at(0).beginning = 0;
179 m_interest_cells.at(0).tree_id = 0;
180 for(std::size_t i = 1; i < m_interest_cells.size(); i++)
181 {
182 m_interest_cells.at(i).n_row = 0;
184 m_interest_cells.at(i).beginning = 0;
185 m_interest_cells.at(i).tree_id = 0;
186 }
187 qDebug();
188 for(std::size_t iter = m_interest_cells.size(); iter < spectrum.size(); iter++)
189 {
190 m_interest_cells.push_back({0, m_scorevalues.get(ScoreType::init), 0, 0});
191 }
192 qDebug();
193 m_scenario.resetScenario();
194 qDebug();
195 for(std::size_t row_number = 1; row_number <= length2; row_number++)
196 {
197
198 qDebug() << "row_number - 1=" << row_number - 1 << " sequence.size()=" << sequence.size();
199 // aa = Aa(sequence[row_number - 1].unicode());
200 updateAlignmentMatrix(sequence,
201 row_number,
202 spectrum.getAaPositions(sequence[row_number - 1].code),
203 spectrum,
204 false,
205 protein_ptr);
206 }
207 qDebug();
208 saveBestAlignment(sequence, spectrum, protein_seq.size() - beginning);
209
210 qDebug() << m_scenario.getBestScore() << " " << MIN_ALIGNMENT_SCORE;
211 // Correction : if complementary peaks are used, corrected spectra without one of the two
212 // peaks are generated and aligned. The best alignment is kept.
213 if(m_scenario.getBestScore() >
214 MIN_ALIGNMENT_SCORE) // We only correct alignments with acceptable scores
215 {
216 // we only correct alignment if the sequence has a minimum amino acid diversity
217 if(checkSequenceDiversity(sequence.getSequence(), 5, 2))
218 {
219
220 qDebug();
222 for(std::size_t iter : m_best_alignment.peaks)
223 {
224 if(iter > spectrum.getComplementaryPeak(iter))
225 {
226 break;
227 }
228 else if(std::find(m_best_alignment.peaks.begin(),
229 m_best_alignment.peaks.end(),
230 spectrum.getComplementaryPeak(iter)) !=
231 m_best_alignment.peaks.end())
232 {
233 correction_tree.addPeaks(iter, spectrum.getComplementaryPeak(iter));
234 }
235 }
236 qDebug();
237 std::vector<std::vector<std::size_t>> corrections = correction_tree.getPeaks();
238 if(corrections.size() > 0)
239 {
240 m_best_alignment.score =
241 0; // Reset the best alignment score (we dont want to keep
242 // the original alignment if corrections are needed)
243 qDebug();
244 for(auto peaks_to_remove : corrections)
245 {
246 qDebug();
247 correctAlign(sequence,
248 protein_ptr,
249 spectrum,
250 peaks_to_remove,
251 protein_seq.size() - beginning);
252 qDebug();
253 }
254 qDebug();
256 }
257 }
258 }
259 else
260 {
261 // this sequence has too much redundancy
262 // we have to lower the score
263 m_best_alignment.score = 0;
264 }
265 qDebug();
266 }
267 catch(const pappso::PappsoException &err)
268 {
270 QObject::tr("SemiGlobalAlignment::preciseAlign failed :\n%1").arg(err.qwhat()));
271 }
272}
273
274void
276 const SpOMSProtein *protein_ptr,
277 const SpOMSSpectrum &spectrum,
278 std::vector<std::size_t> &peaks_to_remove,
279 std::size_t offset)
280{
281 std::vector<AaPosition> aa_positions;
282 CorrectionTree correction_tree;
283 std::vector<std::size_t> final_peaks_to_remove;
284
285 KeyCell key_cell_init;
286 key_cell_init.beginning = 0;
287 key_cell_init.n_row = 0;
288 key_cell_init.score = m_scorevalues.get(ScoreType::init);
289 key_cell_init.tree_id = 0;
290
291 std::fill(m_interest_cells.begin(), m_interest_cells.end(), key_cell_init);
292
293 m_interest_cells.at(0).score = 0;
294
295 m_scenario.resetScenario();
296 qDebug();
297 for(qsizetype row_number = 1; row_number <= sequence.size(); row_number++)
298 {
299 qDebug() << row_number - 1 << " " << sequence.size();
300 qDebug() << "sequence[row_number - 1].aa" << (char)sequence[row_number - 1].aa;
301 qDebug();
302 aa_positions = spectrum.getAaPositions(sequence[row_number - 1].code, peaks_to_remove);
303 qDebug();
304 updateAlignmentMatrix(sequence, row_number, aa_positions, spectrum, false, protein_ptr);
305 qDebug();
306 }
307
308 qDebug();
309 // Correction : if complementary peaks are used, corrected spectra without one of the two peaks
310 // are generated and aligned. The best alignment is kept.
311 qDebug() << m_scenario.getBestScore();
312 if(m_scenario.getBestScore() >
313 MIN_ALIGNMENT_SCORE) // We only correct alignments with acceptable scores
314 {
315 qDebug();
316 qDebug() << sequence.getSequence();
317 qDebug() << offset;
318 qDebug() << spectrum.getPrecursorCharge();
319 saveBestAlignment(sequence, spectrum, offset);
320 qDebug();
321 for(std::size_t iter : m_best_alignment.peaks)
322 {
323 qDebug() << "iter:" << iter << "comp:" << spectrum.getComplementaryPeak(iter);
324 if(iter == spectrum.getComplementaryPeak(iter))
325 {
326 continue;
327 }
328 else if(iter > spectrum.getComplementaryPeak(iter))
329 {
330 break;
331 }
332 else if(std::find(m_best_alignment.peaks.begin(),
333 m_best_alignment.peaks.end(),
334 spectrum.getComplementaryPeak(iter)) != m_best_alignment.peaks.end())
335 {
336 correction_tree.addPeaks(iter, spectrum.getComplementaryPeak(iter));
337 }
338 }
339 std::vector<std::vector<std::size_t>> corrections = correction_tree.getPeaks();
340 if(corrections.size() > 0)
341 {
342 for(auto new_peaks_to_remove : corrections)
343 {
344 final_peaks_to_remove = std::vector<std::size_t>(new_peaks_to_remove);
345 final_peaks_to_remove.insert(
346 final_peaks_to_remove.end(), peaks_to_remove.begin(), peaks_to_remove.end());
347 correctAlign(sequence, protein_ptr, spectrum, final_peaks_to_remove, offset);
348 }
349 }
350 else if(m_scenario.getBestScore() > m_best_corrected_alignment.score)
351 {
353 }
354 }
355 qDebug();
356}
357
358void
360 const SpOMSProtein *protein_ptr,
361 std::size_t beginning,
362 std::size_t length,
363 const std::vector<double> &shifts)
364{
365 std::size_t current_SPC = m_best_alignment.SPC;
366 int current_best_score = m_best_alignment.score;
368 for(double precursor_mass_error : shifts)
369 {
370 SpOMSSpectrum corrected_spectrum(spectrum, precursor_mass_error);
371 preciseAlign(corrected_spectrum, protein_ptr, beginning, length);
373 {
375 }
376 }
377 if(m_best_post_processed_alignment.SPC > current_SPC &&
378 m_best_post_processed_alignment.score >= current_best_score)
379 {
381 }
382}
383
384void
387 const std::size_t row_number,
388 const std::vector<AaPosition> &aa_positions,
389 const SpOMSSpectrum &spectrum,
390 const bool fast_align,
391 const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
392{
393 int where = 0;
394 try
395 {
396 int score_found, score_shift, best_score, alt_score, tree_id;
397 uint32_t condition; // FIXME : may be used uninitialised
398 std::size_t best_column, shift, beginning, missing_aas, length, perfect_shift_origin;
399 KeyCell *current_cell_ptr, *tested_cell_ptr;
400 AlignType alignment_type, temp_align_type;
401
402 double smallest_aa_mass = m_aaCode.getMass((std::uint8_t)1);
403
404 m_updated_cells.reserve(aa_positions.size());
405 where = 1;
406 // Computation of the threePeaks condition, see spomsspectrum.h for more details.
407 if(fast_align)
408 {
409 condition = 3;
410 if(row_number > 1)
411 {
412 qDebug() << (char)sequence.at(row_number - 2).aa;
413 qDebug() << "condition" << condition;
414 condition += 2 << sequence.at(row_number - 2).code;
415 qDebug();
416 qDebug() << "condition" << condition;
417 }
418 }
419 where = 2;
420 for(std::vector<AaPosition>::const_iterator aa_position = aa_positions.begin();
421 aa_position != aa_positions.end();
422 aa_position++)
423 {
424
425 where = 3;
426 if(((condition & aa_position->condition) != 0) ||
427 !fast_align) // Verification of the threePeaks condition (only during first alignment).
428 {
429 current_cell_ptr = &m_interest_cells.at(aa_position->r_peak);
430 if(spectrum.peakType(aa_position->r_peak) ==
432 {
433 score_found = m_scorevalues.get(ScoreType::foundDouble);
435 }
436 else
437 {
438 score_found = m_scorevalues.get(ScoreType::found);
439 score_shift = m_scorevalues.get(ScoreType::foundShift);
440 }
441
442 // not found case (always computed)
443 best_column = aa_position->r_peak;
444 best_score = current_cell_ptr->score + (row_number - current_cell_ptr->n_row) *
446 beginning = current_cell_ptr->beginning;
447 tree_id = current_cell_ptr->tree_id;
448 alignment_type = AlignType::notFound;
449
450 // found case (Can only happen if the left peak is supported)
451 if(aa_position->l_support)
452 {
453 tested_cell_ptr = &m_interest_cells.at(aa_position->l_peak);
454 if(aa_position->l_peak == 0)
455 {
456 alt_score = tested_cell_ptr->score + score_found;
457 }
458 else
459 {
460 if(tested_cell_ptr->n_row == row_number - 1)
461 {
462 alt_score = tested_cell_ptr->score +
463 (row_number - tested_cell_ptr->n_row - 1) *
465 score_found;
466 }
467 else
468 {
469 alt_score = tested_cell_ptr->score +
470 (row_number - tested_cell_ptr->n_row - 1) *
472 score_shift;
473 }
474 }
475 if(alt_score >= best_score)
476 {
477 alignment_type = AlignType::found;
478 best_score = alt_score;
479 best_column = aa_position->l_peak;
480 if(best_column == 0)
481 {
482 if(row_number < ALIGNMENT_SURPLUS)
483 {
484 beginning = 0;
485 }
486 else
487 {
488 beginning = std::max((std::size_t)(row_number - ALIGNMENT_SURPLUS),
489 (std::size_t)0);
490 }
491 if(fast_align)
492 {
493 tree_id = m_location_saver.getNextTree();
494 }
495 }
496 else
497 {
498 beginning = tested_cell_ptr->beginning;
499 tree_id = tested_cell_ptr->tree_id;
500 }
501 }
502 }
503
504 where = 4;
505 // generic shift case (all shifts are tested)
506 if(aa_position->l_support)
507 {
508 shift = 1;
509 }
510 else
511 {
512 shift = 0;
513 }
514 while(shift < aa_position->l_peak)
515 {
516 tested_cell_ptr = &m_interest_cells.at(aa_position->l_peak - shift);
517 // verification saut parfait
518 if(perfectShiftPossible(sequence,
519 spectrum,
520 tested_cell_ptr->n_row,
521 row_number,
522 aa_position->l_peak - shift,
523 aa_position->r_peak) &&
525 {
526 alt_score = tested_cell_ptr->score +
527 (row_number - tested_cell_ptr->n_row - 1) *
529 score_found;
530 temp_align_type = AlignType::perfectShift;
531 }
532 else
533 {
534 alt_score = tested_cell_ptr->score +
535 (row_number - tested_cell_ptr->n_row - 1) *
537 score_shift;
538 temp_align_type = AlignType::shift;
539 }
540 if(alt_score > best_score)
541 {
542 alignment_type = temp_align_type;
543 best_score = alt_score;
544 best_column = aa_position->l_peak - shift;
545 beginning = tested_cell_ptr->beginning;
546 tree_id = tested_cell_ptr->tree_id;
547 }
548 shift++;
549 }
550
551 where = 5;
552 // case shift from column 0 (no penalties if all precedent amino acids are missed)
553 tested_cell_ptr = &m_interest_cells.at(0);
554 // verification saut parfait
555 if(aa_position->r_peak <= TOL_PEAKS_MISSING_FIRST_COLUMN)
556 {
557 perfect_shift_origin =
558 perfectShiftPossibleFrom0(sequence, spectrum, row_number, aa_position->r_peak);
559 }
560 else
561 {
562 perfect_shift_origin = row_number;
563 }
564
565 if(perfect_shift_origin != row_number)
566 {
567 alt_score = tested_cell_ptr->score + score_found;
568 temp_align_type = AlignType::perfectShift;
569 }
570 else
571 {
572 alt_score = tested_cell_ptr->score + score_shift;
573 temp_align_type = AlignType::shift;
574 }
575
576 where = 6;
577 if(alt_score > best_score)
578 {
579 alignment_type = temp_align_type;
580 best_score = alt_score;
581 best_column = 0;
582 missing_aas =
583 std::floor(spectrum.getMZShift(0, aa_position->l_peak) / smallest_aa_mass);
584 if(row_number < ALIGNMENT_SURPLUS + missing_aas)
585 {
586 beginning = 0;
587 }
588 else
589 {
590 beginning =
591 std::max((std::size_t)(row_number - missing_aas - ALIGNMENT_SURPLUS),
592 (std::size_t)0);
593 }
594 where = 7;
595 if(fast_align)
596 {
597 tree_id = m_location_saver.getNextTree();
598 }
599 }
600
601 where = 8;
602 if(best_column != aa_position->r_peak)
603 {
604 m_updated_cells.push_back(
605 {aa_position->r_peak, {row_number, best_score, beginning, tree_id}});
606 }
607
608 where = 9;
609 if(best_score > m_location_saver.getMinScore(tree_id) && fast_align)
610 {
611 length =
612 row_number - beginning + 1 +
613 std::ceil(spectrum.getMissingMass(aa_position->r_peak) / smallest_aa_mass) +
615 where = 10;
616 m_location_saver.addLocation(beginning, length, tree_id, best_score, protein_ptr);
617 }
618 else if(!fast_align)
619 {
620
621 where = 11;
622 if(alignment_type == AlignType::perfectShift && best_column == 0)
623 {
624 m_scenario.saveOrigin(row_number,
625 aa_position->r_peak,
626 perfect_shift_origin,
627 0,
628 best_score,
630 }
631 else
632 {
633 m_scenario.saveOrigin(row_number,
634 aa_position->r_peak,
635 m_interest_cells.at(best_column).n_row,
636 best_column,
637 best_score,
638 alignment_type);
639 }
640 }
641 }
642 }
643
644 where = 30;
645 // Update row number in column 0
646 m_updated_cells.push_back({0, {row_number, 0, 0, 0}});
647
648 // Save updated key cells in the matrix
649 while(m_updated_cells.size() > 0)
650 {
651 qDebug() << m_interest_cells.size() << " " << m_updated_cells.back().first;
652 m_interest_cells.at(m_updated_cells.back().first) = m_updated_cells.back().second;
653 m_updated_cells.pop_back();
654 }
655 where++;
656 }
657 catch(const std::exception &error)
658 {
660 QObject::tr("updateAlignmentMatrix failed std::exception :\n%1 %2")
661 .arg(where)
662 .arg(error.what()));
663 }
664 catch(const pappso::PappsoException &err)
665 {
667 QObject::tr("updateAlignmentMatrix failed :\n%1").arg(err.qwhat()));
668 }
669}
670
671bool
674 const SpOMSSpectrum &spectrum,
675 const std::size_t origin_row,
676 const std::size_t current_row,
677 const std::size_t l_peak,
678 const std::size_t r_peak) const
679{
680 try
681 {
682 double missing_mass = 0;
683 auto it_end = sequence.begin() + current_row;
684 for(auto iter = sequence.begin() + origin_row; (iter != it_end) && (iter != sequence.end());
685 iter++)
686 {
687 missing_mass += iter->mass; // Aa(iter->unicode()).getMass();
688 }
689 if(missing_mass > 0)
690 {
691 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
692 return mz_range.contains(spectrum.getMZShift(l_peak, r_peak));
693 }
694 else
695 {
696 return false;
697 }
698 }
699 catch(const std::exception &error)
700 {
702 QObject::tr("perfectShiftPossible failed std exception:\n%1").arg(error.what()));
703 }
704 catch(const pappso::PappsoException &err)
705 {
707 QObject::tr("perfectShiftPossible failed :\n%1").arg(err.qwhat()));
708 }
709}
710
711std::size_t
714 const SpOMSSpectrum &spectrum,
715 const std::size_t current_row,
716 const std::size_t r_peak) const
717{
718 std::size_t perfect_shift_origin = current_row;
719 double missing_mass = spectrum.getMZShift(0, r_peak);
720 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
721 double aa_mass = 0;
722 while(aa_mass < missing_mass && perfect_shift_origin > 0 && !mz_range.contains(aa_mass))
723 {
724 aa_mass += sequence.at(perfect_shift_origin - 1)
725 .mass; // Aa(sequence.at(perfect_shift_origin - 1).unicode()).getMass();
726 perfect_shift_origin--;
727 }
728 if(mz_range.contains(aa_mass))
729 {
730 return perfect_shift_origin;
731 }
732 else
733 {
734 return current_row;
735 }
736}
737
738std::size_t
741 const SpOMSSpectrum &spectrum,
742 std::size_t end_row,
743 std::size_t end_peak) const
744{
745 try
746 {
747 std::size_t perfect_shift_end = end_row + 1;
748 double missing_mass = spectrum.getMissingMass(end_peak);
749 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
750 double aa_mass = 0;
751 while(aa_mass < missing_mass && perfect_shift_end < (std::size_t)sequence.size() &&
752 !mz_range.contains(aa_mass))
753 {
754 aa_mass += sequence.at(perfect_shift_end - 1)
755 .mass; // Aa(sequence.at(perfect_shift_end - 1).unicode()).getMass();
756 perfect_shift_end++;
757 }
758 if(mz_range.contains(aa_mass))
759 {
760 return perfect_shift_end - 1;
761 }
762 else
763 {
764 return end_row;
765 }
766 }
767 catch(const pappso::PappsoException &err)
768 {
770 QObject::tr("perfectShiftPossibleEnd failed :\n%1").arg(err.qwhat()));
771 }
772}
773
777
783
789
790void
793 const SpOMSSpectrum &spectrum,
794 std::size_t offset)
795{
796 qDebug();
797 m_best_alignment.m_peptideModel.reset();
798 m_best_alignment.peaks.clear();
799 m_best_alignment.shifts.clear();
800 std::size_t previous_row; // FIXME : may be used uninitialised
801 std::size_t previous_column = 0;
802 std::size_t perfect_shift_end;
803 std::pair<std::vector<ScenarioCell>, int> best_alignment = m_scenario.getBestAlignment();
804 m_best_alignment.score = best_alignment.second;
805 std::vector<SpOMSAa> skipped_aa;
806 double skipped_mass;
807 // Retrieving beginning and end
808 if(best_alignment.first.front().previous_row > offset)
809 {
811 QString("best_alignment.first.front().previous_row > offset %1 %2")
812 .arg(offset)
813 .arg(best_alignment.first.front().previous_row));
814 }
815 if(best_alignment.first.back().previous_row > offset)
816 {
818 QString("best_alignment.first.back().previous_row > offset %1 %2")
819 .arg(offset)
820 .arg(best_alignment.first.back().previous_row));
821 }
822 m_best_alignment.beginning = offset - best_alignment.first.front().previous_row;
823 m_best_alignment.end = offset - best_alignment.first.back().previous_row - 1;
824
825 qDebug();
826 AminoAcidModel aa_model;
827 aa_model.m_massDifference = 0;
828 // Filling temp_interpretation and peaks vectors
829 for(auto cell : best_alignment.first)
830 {
831 switch(cell.alignment_type)
832 {
833 case AlignType::found:
834 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
835 aa_model.m_massDifference = 0;
836 aa_model.m_skipped = false;
837 m_best_alignment.m_peptideModel.push_back(aa_model);
838 if(previous_row > cell.previous_row + 1)
839 {
840 skipped_mass = sequence.at(previous_row - 1)
841 .mass; // Aa(sequence.at(previous_row - 1).unicode()).getMass();
842 skipped_aa =
843 sequence.sliced(cell.previous_row, previous_row - cell.previous_row - 1);
844 aa_model.m_massDifference = 0;
845 aa_model.m_skipped = true;
846 for(auto aa : skipped_aa)
847 {
848 aa_model.m_aminoAcid = aa.aa;
849 m_best_alignment.m_peptideModel.push_back(aa_model);
850 skipped_mass += aa.mass; // Aa(aa.unicode()).getMass();
851 }
852 m_best_alignment.m_peptideModel.back().m_massDifference =
853 spectrum.getMZShift(cell.previous_column, previous_column) - skipped_mass;
854 }
855 m_best_alignment.peaks.push_back(cell.previous_column);
856 break;
858 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
859 aa_model.m_massDifference = 0;
860 aa_model.m_skipped = true;
861 m_best_alignment.m_peptideModel.push_back(aa_model);
862 break;
863 case AlignType::shift:
864
865 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
866 aa_model.m_massDifference = spectrum.getMZShift(cell.previous_column, previous_column) -
867 aa_model.m_aminoAcid.getMass();
868 aa_model.m_skipped = false;
869 m_best_alignment.m_peptideModel.push_back(aa_model);
870 m_best_alignment.peaks.push_back(cell.previous_column);
871 m_best_alignment.shifts.push_back(
872 spectrum.getMZShift(cell.previous_column, previous_column) -
873 sequence.at(previous_row - 1).mass);
874 break;
876 m_best_alignment.peaks.push_back(cell.previous_column);
877 skipped_aa = sequence.sliced(cell.previous_row, previous_row - cell.previous_row);
878 std::reverse(skipped_aa.begin(), skipped_aa.end());
879 aa_model.m_massDifference = 0;
880 aa_model.m_skipped = false;
881 for(auto aa : skipped_aa)
882 {
883 aa_model.m_aminoAcid = aa.aa;
884 m_best_alignment.m_peptideModel.push_back(aa_model);
885 }
886 break;
887 case AlignType::init:
888 previous_row = cell.previous_row;
889 previous_column = cell.previous_column;
890 m_best_alignment.peaks.push_back(cell.previous_column);
891 break;
892 }
893 previous_row = cell.previous_row;
894 previous_column = cell.previous_column;
895 }
896 std::reverse(m_best_alignment.peaks.begin(), m_best_alignment.peaks.end());
897 std::reverse(m_best_alignment.m_peptideModel.begin(), m_best_alignment.m_peptideModel.end());
898
899 qDebug();
900 // Compute begin_shift and end_shift
901 MzRange zero(0, m_precision_ptr);
902 m_best_alignment.begin_shift = spectrum.getMZShift(0, m_best_alignment.peaks.front());
903 m_best_alignment.end_shift = spectrum.getMissingMass(m_best_alignment.peaks.back());
904 if(zero.contains(m_best_alignment.end_shift))
905 {
906 m_best_alignment.end_shift = 0;
907 }
908
909 qDebug();
910 // Computing SPC
911 m_best_alignment.SPC = 0;
912 for(auto peak : m_best_alignment.peaks)
913 {
914 switch(spectrum.at(peak).type)
915 {
917 qDebug() << peak << "native";
918 m_best_alignment.SPC += 1;
919 break;
921 qDebug() << peak << "both";
922 m_best_alignment.SPC += 2;
923 break;
925 qDebug() << peak << "synthetic";
926 break;
928 qDebug() << peak << "symmetric";
929 m_best_alignment.SPC += 1;
930 break;
931 }
932 }
933
934 qDebug();
935 // Final check of the end shift
936 if(m_best_alignment.end_shift > 0)
937 {
938 perfect_shift_end = perfectShiftPossibleEnd(sequence,
939 spectrum,
940 best_alignment.first.front().previous_row,
941 m_best_alignment.peaks.back());
942 if(perfect_shift_end != best_alignment.first.front().previous_row)
943 {
944 skipped_aa =
945 sequence.sliced(best_alignment.first.front().previous_row,
946 perfect_shift_end - best_alignment.first.front().previous_row);
947 aa_model.m_massDifference = 0;
948 aa_model.m_skipped = true;
949 for(auto aa = skipped_aa.begin(); aa != skipped_aa.end(); aa++)
950 {
951 aa_model.m_aminoAcid = aa->aa;
952 m_best_alignment.m_peptideModel.push_back(aa_model);
953 }
954 m_best_alignment.beginning = offset - perfect_shift_end;
955 m_best_alignment.end_shift = 0;
956 }
957 else
958 {
960 }
961 }
962
963 qDebug();
964 // Writing final interpretation
965 if(m_best_alignment.end_shift > 0)
966 {
967 m_best_alignment.m_peptideModel.setNterShift(m_best_alignment.end_shift);
968 }
969
970 std::reverse(m_best_alignment.m_peptideModel.begin(), m_best_alignment.m_peptideModel.end());
971 if(m_best_alignment.begin_shift > 0)
972 {
973 m_best_alignment.m_peptideModel.setCterShift(m_best_alignment.begin_shift);
974 }
975
976 m_best_alignment.m_peptideModel.setPrecursorMass(spectrum.getPrecursorMass());
977 qDebug();
978}
979
985
986std::vector<double>
988 const Alignment &alignment,
989 const QString &protein_seq)
990{
991 // qDebug() << protein_seq;
992 if(alignment.end > (std::size_t)protein_seq.size())
993 {
994 throw pappso::ExceptionOutOfRange(QString("alignment.end > protein_seq.size() %1 %2")
995 .arg(alignment.end)
996 .arg(protein_seq.size()));
997 }
998 std::vector<double> potential_mass_errors(alignment.shifts);
999 double shift = alignment.end_shift;
1000 std::size_t index;
1001 if(alignment.beginning > 0)
1002 { // -1 on unsigned int makes it wrong
1003 index = alignment.beginning - 1;
1004 while(shift > 0 && index > 0)
1005 {
1006 potential_mass_errors.push_back(shift);
1007 // qDebug() << " shift=" << shift << " index=" << index
1008 // << " letter=" << protein_seq.at(index).toLatin1();
1009 shift -= aa_code.getMass(
1010 protein_seq.at(index).toLatin1()); // Aa(protein_seq.at(index).unicode()).getMass();
1011 index--;
1012 }
1013 }
1014
1015 // qDebug() << "second";
1016 shift = alignment.begin_shift;
1017 index = alignment.end + 1;
1018 while(shift > 0 && index < (std::size_t)protein_seq.size())
1019 {
1020 potential_mass_errors.push_back(shift);
1021 qDebug() << " shift=" << shift << " index=" << index
1022 << " letter=" << protein_seq.at(index).toLatin1();
1023 shift -= aa_code.getMass(
1024 protein_seq.at(index).toLatin1()); // Aa(protein_seq.at(index).unicode()).getMass();
1025 index++;
1026 }
1027 // qDebug();
1028 return potential_mass_errors;
1029}
1030
1031bool
1033 std::size_t window,
1034 std::size_t minimum_aa_diversity)
1035{
1036 qDebug() << "sequence=" << sequence << " window=" << window
1037 << " minimum_aa_diversity=" << minimum_aa_diversity;
1038 if(sequence.size() < window)
1039 return false;
1040 auto it_begin = sequence.begin();
1041 auto it_end = sequence.begin() + window;
1042 QString window_copy(sequence.mid(0, window));
1043 while(it_end != sequence.end())
1044 {
1045 std::partial_sort_copy(it_begin, it_end, window_copy.begin(), window_copy.end());
1046
1047 qDebug() << window_copy;
1048 std::size_t uniqueCount =
1049 std::unique(window_copy.begin(), window_copy.end()) - window_copy.begin();
1050
1051 qDebug() << uniqueCount;
1052 if(uniqueCount < minimum_aa_diversity)
1053 return false;
1054 it_begin++;
1055 it_end++;
1056 }
1057 return true;
1058}
1059
1060const std::vector<pappso::specpeptidoms::KeyCell> &
1063 const std::size_t row_number,
1064 const std::vector<AaPosition> &aa_positions,
1065 const SpOMSSpectrum &spectrum,
1066 const bool fast_align,
1067 const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
1068{
1069 updateAlignmentMatrix(sequence, row_number, aa_positions, spectrum, fast_align, protein_ptr);
1070 return getInterestCells();
1071}
collection of integer code for each amino acid 0 => null 1 to 20 => amino acid sorted by there mass (...
Definition aacode.h:44
double getMass(uint8_t aa_code) const
get the mass of the amino acid given its integer code the amino acid can bear some modification (if a...
Definition aacode.cpp:224
pappso_double getMass() const override
Definition aa.cpp:90
bool contains(pappso_double) const
Definition mzrange.cpp:116
virtual const QString & qwhat() const
std::vector< std::vector< std::size_t > > getPeaks() const
void addPeaks(std::size_t peak1, std::size_t peak2)
const Alignment & getBestAlignment() const
Returns a const ref to m_best_alignment.
Scenario getScenario() const
Returns a copy of m_scenario.
std::size_t perfectShiftPossibleEnd(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t end_row, std::size_t end_peak) const
indicates if a perfect shift is possible between the provided positions
void updateAlignmentMatrix(const pappso::specpeptidoms::SpOMSProtein &sequence, const std::size_t row_number, const std::vector< AaPosition > &aa_positions, const SpOMSSpectrum &spectrum, const bool fast_align, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
updates the scores of the alignment matrix for a given amino acid as well as the location heap/scenar...
void postProcessingAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, std::size_t beginning, std::size_t length, const std::vector< double > &shifts)
performs the post-processing : generates corrected spectra and align them
void preciseAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, const std::size_t beginning, const std::size_t length)
performs the second alignment search between a protein subsequence and a spectrum.
void correctAlign(const SpOMSProtein &protein_subseq, const SpOMSProtein *protein_ptr, const SpOMSSpectrum &spectrum, std::vector< std::size_t > &peaks_to_remove, std::size_t offset)
Recursively performs the correction of the alignment.
const std::vector< KeyCell > & oneAlignStep(const pappso::specpeptidoms::SpOMSProtein &sequence, const std::size_t row_number, const std::vector< AaPosition > &aa_positions, const SpOMSSpectrum &spectrum, const bool fast_align, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
function made for testing the fastAlign process, process one line and return the alignment matrix
const std::vector< KeyCell > & getInterestCells() const
convenient function for degub purpose
bool perfectShiftPossible(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t origin_row, const std::size_t current_row, const std::size_t l_peak, const std::size_t r_peak) const
indicates if a perfect shift is possible between the provided positions
std::vector< std::pair< std::size_t, KeyCell > > m_updated_cells
void fastAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr)
perform the first alignment search between a protein sequence and a spectrum. The member location hea...
static bool checkSequenceDiversity(const QString &sequence, std::size_t window, std::size_t minimum_aa_diversity)
check that the sequence has a minimum of amino acid checkSequenceDiversity
std::size_t perfectShiftPossibleFrom0(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t current_row, const std::size_t r_peak) const
indicates if a perfect shift is possible from the spectrum beginning to the provided peak....
void initFastAlign(const SpOMSSpectrum &spectrum)
function made for testing the fastAlign process, initiate the variables for alignment
static std::vector< double > getPotentialMassErrors(const pappso::AaCode &aa_code, const Alignment &alignment, const QString &protein_seq)
Returns a list of the potential mass errors corresponding to the provided alignment in the provided p...
void saveBestAlignment(const SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t offset)
Stores the best alignment from m_scenario in m_best_alignment.
SemiGlobalAlignment(const ScoreValues &score_values, const pappso::PrecisionPtr precision_ptr, const AaCode &aaCode)
LocationSaver getLocationSaver() const
Returns a copy of m_location_saver.
const QString & getSequence() const
std::vector< SpOMSAa > sliced(std::size_t position, std::size_t length) const
double getMZShift(std::size_t l_peak, std::size_t r_peak) const
Returns the mz difference between two peaks.
uint getPrecursorCharge() const
Returns the spectrum's precursor's charge.
double getMissingMass(std::size_t peak) const
Returns the missing mass between a peak and the precursor's mass (shift at the end).
std::size_t getComplementaryPeak(std::size_t peak) const
const std::vector< AaPosition > & getAaPositions(std::uint8_t aa_code) const
Returns the list of aa_positions for a given amino acid code.
specglob::ExperimentalSpectrumDataPointType peakType(std::size_t indice) const
Returns the type of one of the spectrum's peaks.
@ synthetic
does not correspond to existing peak, for computational purpose
Definition types.h:85
@ both
both, the ion and the complement exists in the original spectrum
Definition types.h:83
@ symmetric
new peak : computed symmetric mass from a corresponding native peak
Definition types.h:81
const uint ALIGNMENT_SURPLUS(5)
const int MIN_ALIGNMENT_SCORE(15)
const uint TOL_PEAKS_MISSING(4)
const uint TOL_PEAKS_MISSING_FIRST_COLUMN(5)
const PrecisionBase * PrecisionPtr
Definition precision.h:122
void reset()
reinitialize to default score_values
QString getPeptideString(const QString &protein_sequence) const
convenient function to get peptide sequence from location
double getNonAlignedMass() const
convenient function to get the remaining non explained mass shift
std::vector< std::size_t > peaks
std::size_t getPositionStart() const
get position of start on the protein sequence