ReadLikelihoodCalculationEngine

ReadLikelihoodCalculationEngine is an interface in GATK (the Genome Analysis Toolkit) for computing the likelihood of sequencing reads under different candidate alleles (haplotypes). These likelihoods are central to variant calling: they help determine which alleles are most likely to represent true genetic variants.
Main functionality
The main job of ReadLikelihoodCalculationEngine is to take the sequencing reads, the reference sequence, and the candidate alleles, and compute the likelihood of every read under every candidate allele. These likelihoods are then used to recognize true variants and filter out artifacts.
Main methods
computeReadLikelihoods:
Computes the likelihood of each read under each candidate allele. Output: an AlleleLikelihoods<GATKRead, Haplotype> object holding the read-by-allele likelihood matrix.
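The following is a hypothetical usage sketch, not code taken from GATK: it assumes the GATK types and imports shown in the source listings below, and the helper name scoreReads (and its parameter names) is made up for illustration. It shows how a caller that already has assembled haplotypes and per-sample reads might obtain and inspect the likelihood matrix.

// Hypothetical usage sketch (not from GATK). The caller is assumed to have already
// assembled candidate haplotypes and grouped its reads per sample.
static AlleleLikelihoods<GATKRead, Haplotype> scoreReads(
        final ReadLikelihoodCalculationEngine engine,      // e.g. a FlowBasedAlignmentLikelihoodEngine
        final List<Haplotype> haplotypes,                  // candidate haplotypes from assembly
        final SAMFileHeader header,
        final SampleList samples,
        final Map<String, List<GATKRead>> readsBySample) {

    // filterPoorly = true asks the engine to drop reads that fit no haplotype well.
    final AlleleLikelihoods<GATKRead, Haplotype> likelihoods =
            engine.computeReadLikelihoods(haplotypes, header, samples, readsBySample, true);

    // Each sample exposes a haplotype-by-read matrix of log10 likelihoods.
    final LikelihoodMatrix<GATKRead, Haplotype> matrix = likelihoods.sampleMatrix(0);
    final double log10LikOfFirstReadVsFirstHaplotype = matrix.get(0, 0);  // illustrative lookup only
    return likelihoods;
}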
Use cases
ReadLikelihoodCalculationEngine plays an important role in many GATK variant-calling pipelines, notably in somatic mutation calling (e.g. Mutect2) and in standard germline SNP/indel calling.
Implementation classes: FlowBasedAlignmentLikelihoodEngine, FlowBasedHMMEngine, PDPairHMMLikelihoodCalculationEngine, and PairHMMLikelihoodCalculationEngine. Their source is reproduced below.

FlowBasedAlignmentLikelihoodEngine source
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import htsjdk.samtools.SAMFileHeader;
import org.apache.commons.math3.util.Precision;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.*;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.read.*;
import org.broadinstitute.hellbender.utils.haplotype.FlowBasedHaplotype;

import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.function.ToDoubleFunction;
import java.util.stream.IntStream;

/**
 * Flow based replacement for PairHMM likelihood calculation. Likelihood calculation all-vs-all flow based reads
 * and haplotypes.
 *
 * see {@link #haplotypeReadMatching(FlowBasedHaplotype, FlowBasedRead)}
 */
public class FlowBasedAlignmentLikelihoodEngine implements ReadLikelihoodCalculationEngine {public static final double MAX_ERRORS_FOR_READ_CAP = 3.0;public static final double MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP = 2.0;// for the two most common probability values, we pre-compute log10 to save runtime.public static final double COMMON_PROB_VALUE_1 = 0.988;public static final double COMMON_PROB_VALUE_2 = 0.001;public static final double COMMON_PROB_VALUE_1_LOG10 = Math.log10(COMMON_PROB_VALUE_1);public static final double COMMON_PROB_VALUE_2_LOG10 = Math.log10(COMMON_PROB_VALUE_2);private double log10globalReadMismappingRate;private final double expectedErrorRatePerBase;private final boolean symmetricallyNormalizeAllelesToReference;private static final int ALIGNMENT_UNCERTAINTY = 4;final FlowBasedAlignmentArgumentCollection fbargs;private final Logger logger = LogManager.getLogger(this.getClass());private final boolean dynamicReadDisqualification;private final double readDisqualificationScale;private ForkJoinPool threadPool;/*** Default constructor* @param fbargs - arguments* @param log10globalReadMismappingRate - probability for wrong mapping (maximal contribution of the read to data likelihood)* @param expectedErrorRatePerBase - the expected rate of random sequencing errors for a read originating from its true haplotype.*/public FlowBasedAlignmentLikelihoodEngine(final FlowBasedAlignmentArgumentCollection fbargs,final double log10globalReadMismappingRate,final double expectedErrorRatePerBase,final boolean dynamicReadDisqualification, final double readDisqualificationScale) {this.fbargs = fbargs;this.log10globalReadMismappingRate = log10globalReadMismappingRate;this.expectedErrorRatePerBase = expectedErrorRatePerBase;this.readDisqualificationScale = readDisqualificationScale;this.dynamicReadDisqualification = dynamicReadDisqualification;this.symmetricallyNormalizeAllelesToReference = true;if ( this.fbargs.flowLikelihoodParallelThreads > 0 ) {threadPool = new ForkJoinPool(this.fbargs.flowLikelihoodParallelThreads);}}/*** Read/haplotype likelihood calculation for all samples* @param samples the list of targeted samples.* @param perSampleReadList the input read sets stratified per sample.** @param filterPoorly - if the poorly modeled reads should be removed* @return*/public AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(final List<Haplotype> haplotypeList,final SAMFileHeader hdr,final SampleList samples,final Map<String, List<GATKRead>> perSampleReadList, final boolean filterPoorly) {return computeReadLikelihoods(haplotypeList, hdr, samples, perSampleReadList,filterPoorly, true);}/*** Read/haplotype likelihood calculation for all samples* @param samples the list of targeted samples.* @param perSampleReadList the input read sets stratified per sample.** @param filterPoorly - if the poorly modeled reads should be removed* @param normalizeLikelihoods - if the likelihood of each read should be normalized to the highest* @returnd*/public AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(final List<Haplotype> haplotypeList,final SAMFileHeader hdr,final SampleList samples,final Map<String, List<GATKRead>> perSampleReadList,final boolean filterPoorly, final boolean normalizeLikelihoods) {Utils.nonNull(samples, "samples is null");Utils.nonNull(perSampleReadList, "perSampleReadList is null");Utils.nonNull(haplotypeList, "haplotypeList is null");final AlleleList<Haplotype> haplotypes = new IndexedAlleleList<>(haplotypeList);// Add likelihoods for each sample's reads to our 
resultfinal AlleleLikelihoods<GATKRead, Haplotype> result = new AlleleLikelihoods<>(samples, haplotypes, perSampleReadList);final int sampleCount = result.numberOfSamples();for (int i = 0; i < sampleCount; i++) {computeReadLikelihoods(result.sampleMatrix(i), hdr);}if (normalizeLikelihoods) {result.normalizeLikelihoods(log10globalReadMismappingRate, symmetricallyNormalizeAllelesToReference);}if (filterPoorly) {filterPoorlyModeledEvidence(result, dynamicReadDisqualification, expectedErrorRatePerBase, readDisqualificationScale);}return result;}/** Calculate minimal likelihood that reasonably matching read can get. We divide errors into "expected", e.g.* hmer indels without 0->1/1->0 errors. Those on average occur with expectedErrorRate and "catastrophic' e.g.* 1->0 and 0->1 errors (or large hmer indels). Those occur with probability fbargs.fillingValue.* If the read has more than 3 expected and more than 2 "catastrophic" errors it will probably be deemed unfit to the* haplotype** @param expectedErrorRate error rate for expected errors.* @return minimal likelihood for the read to be considered not poorly modeled*/@Overridepublic ToDoubleFunction<GATKRead> log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) {final double log10ErrorRate = Math.log10(expectedErrorRate);final double largeEventErrorRate = Math.max(fbargs.fillingValue, 0.000001); // error rate for non-hmer/snv errors that are not seq. errors.final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate);return read -> {final double maxErrorsForRead = capLikelihoods ? Math.max(MAX_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * expectedErrorRate)) : Math.ceil(read.getLength() * expectedErrorRate);final double maxCatastrophicErrorsForRead = capLikelihoods ? Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * largeEventErrorRate)) :Math.ceil(read.getLength() * largeEventErrorRate);return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead * log10catastrophicErrorRate;};}/*** Compute read likelihoods for a single sample* @param likelihoods Single sample likelihood matrix* @param hdr SAM header that corresponds to the sample*/protected void computeReadLikelihoods(final LikelihoodMatrix<GATKRead, Haplotype> likelihoods,final SAMFileHeader hdr) {final List<FlowBasedRead> processedReads = new ArrayList<>(likelihoods.evidenceCount());final List<FlowBasedHaplotype> processedHaplotypes = new ArrayList<>(likelihoods.numberOfAlleles());// establish flow order based on the first evidence. Note that all reads belong to the same sample (group)final FlowBasedReadUtils.ReadGroupInfo rgInfo = (likelihoods.evidenceCount() != 0)? FlowBasedReadUtils.getReadGroupInfo(hdr, likelihoods.evidence().get(0)): null;final String flowOrder = (rgInfo != null)? rgInfo.flowOrder.substring(0, fbargs.flowOrderCycleLength): FlowBasedReadUtils.findFirstUsableFlowOrder(hdr, fbargs);//convert all reads to FlowBasedReads (i.e. 
parse the matrix of P(call | haplotype) for each read from the BAM)for (int i = 0 ; i < likelihoods.evidenceCount(); i++) {final GATKRead rd = likelihoods.evidence().get(i);final FlowBasedRead fbRead = new FlowBasedRead(rd, flowOrder, rgInfo.maxClass, fbargs);fbRead.applyAlignment();processedReads.add(fbRead);}for (int i = 0; i < likelihoods.numberOfAlleles(); i++){final FlowBasedHaplotype fbh = new FlowBasedHaplotype(likelihoods.alleles().get(i), flowOrder);processedHaplotypes.add(fbh);}if (fbargs.trimToHaplotype) {//NOTE: we assume all haplotypes start and end on the same place!final int haplotypeStart = processedHaplotypes.get(0).getStart();final int haplotypeEnd = processedHaplotypes.get(0).getEnd();for (int i = 0; i < processedReads.size(); i++) {final FlowBasedRead fbr = processedReads.get(i);final int readStart = fbr.getStart();final int readEnd = fbr.getEnd();final int diffLeft = haplotypeStart - readStart;final int diffRight = readEnd - haplotypeEnd;//It is rare that this function is applied, maybe just some boundary cases//in general reads are already trimmed to the haplotype starts and ends so diff_left <= 0 and diff_right <= 0fbr.applyBaseClipping(Math.max(0, diffLeft), Math.max(diffRight, 0), true);}}for (int i = 0; i < likelihoods.numberOfAlleles(); i++){final FlowBasedHaplotype fbh = processedHaplotypes.get(i);if ( threadPool != null ) {final int haplotypeIndex = i;Callable<Void> callable = () -> {IntStream.range(0, likelihoods.evidenceCount()).parallel().forEach(j -> {final double likelihood = fbargs.exactMatching ? haplotypeReadMatchingExactLength(fbh, processedReads.get(j)) : haplotypeReadMatching(fbh, processedReads.get(j));likelihoods.set(haplotypeIndex, j, likelihood);});return null;};try {threadPool.submit(callable).get();} catch (InterruptedException | ExecutionException e) {throw new RuntimeException(e);}} else {for (int j = 0; j < likelihoods.evidenceCount(); j++) {final double likelihood = fbargs.exactMatching ? haplotypeReadMatchingExactLength(fbh, processedReads.get(j)) : haplotypeReadMatching(fbh, processedReads.get(j));likelihoods.set(i, j, likelihood);}}}}/*** Aligns single read to a single haplotype. The alignment process is based on the assumption that the errors are only* changes in the flow calls. Thus, the length of the haplotype in flow space should be equal to the length of the read* in the flow space (on the region where the read aligns).** Example: align read ACGGT* to haplotype GACG-TA. In flow space we see that the overlapping sequences ACGGT and ACGT are of the same length* and we just need to calculate P(R|H) = \prod P(Ri | Hi) for ith flow. Note that the flow matrix of the flow based* read {@link FlowBasedRead} allows for trivial calculation of this product.** As an additional optimization we note that almost always the interval to be aligned corresponds to the overlap* of the read and the haplotype to the reference. We allow for some uncertainty in this, especially when the read* falls inside the deletion in the haplotype relative to the reference.** @param haplotype FlowBasedHaplotype single haplotype* @param read FlowBasedRead single read (trimmed to the haplotype)* @return a score for the alignment of the read to the haplotype. 
The higher the score the "better" the alignment* normally, scores will be negative.* @throws GATKException*/public double haplotypeReadMatching(final FlowBasedHaplotype haplotype, final FlowBasedRead read) throws GATKException {if (read.getDirection() != FlowBasedRead.Direction.REFERENCE ) {throw new GATKException.ShouldNeverReachHereException("Read should be aligned with the reference");}if (!read.isBaseClipped() && fbargs.trimToHaplotype) {throw new GATKException.ShouldNeverReachHereException("Reads should be trimmed to the haplotype");}if (!read.isValid()) {return Double.NEGATIVE_INFINITY;}// the read is assumed to be trimmed to the haplotype by ReadThreadingAssembler.finalizeRegion, the region of the// haplotype to be aligned is estimated by finding the points on the haplotypes that align to the start and the end// of the read on the referencefinal int haplotypeStart = ReadUtils.getReadIndexForReferenceCoordinate(haplotype.getStart(), haplotype.getCigar(),read.getTrimmedStart()).getLeft();final int haplotypeEnd = ReadUtils.getReadIndexForReferenceCoordinate(haplotype.getStart(), haplotype.getCigar(),read.getTrimmedEnd()).getLeft();final int haplotypeLength = haplotypeEnd - haplotypeStart;final int readLength = read.seqLength();//in case there is a deletion on the haplotype and hte read falls inside the deletion (thus length of the read is// longer than the length of the trimmed haplotype:// Reference: GACACACACACT// Read: ACACT// Haplotype GACAC------T// In this case the read (that is not informative) is aligned inside the deletion and thus if we clip the haplotype// to the start position of the read will not support it (haplotype: T, read: ACACT), so in cases we see that the resulting// haploype length is shorter than the resulting read length we extend the "trimmed haplotype" by this length and// also allow more uncertainty for the actual starting position of the alignmentfinal int uncertainty = Math.max(readLength - haplotypeLength,0);final int leftClip = Math.max(haplotypeStart-uncertainty,0);final int rightClip = Math.max(haplotype.length()-haplotypeEnd-1-uncertainty,0);if ((leftClip >= haplotype.length() ) || ( rightClip >= haplotype.length())) {return Double.NEGATIVE_INFINITY;}int [] leftClipping = haplotype.findLeftClipping(leftClip);int clipLeft = leftClipping[0];final int leftHmerClip = leftClipping[1];leftClipping = haplotype.findRightClipping(rightClip);int clipRight = leftClipping[0];final int rightHmerClip = leftClipping[1];if ((clipLeft >= haplotype.getKeyLength()) || (clipRight >= haplotype.getKeyLength())){return Double.NEGATIVE_INFINITY;}if ((leftHmerClip <0) | (rightHmerClip < 0)) {throw new GATKException.ShouldNeverReachHereException("Negative hmer clips found. Check");}final int originalLength = haplotype.getKeyLength();clipLeft = clipLeft-ALIGNMENT_UNCERTAINTY>0 ? clipLeft-ALIGNMENT_UNCERTAINTY:0;clipRight = originalLength - clipRight + ALIGNMENT_UNCERTAINTY < originalLength ?originalLength - clipRight+ALIGNMENT_UNCERTAINTY : originalLength;return fbargs.flowLikelihoodOptimizedComp? optimizedFlowLikelihoodScore(haplotype, read, clipLeft, clipRight): flowLikelihoodScore(haplotype, read, clipLeft, clipRight);}/*** Aligns single read to a single haplotype. The alignment process is based on the assumption that the errors are only* changes in the flow calls. 
Thus, the length of the haplotype in flow space should be equal to the length of the read* in the flow space (on the region where the read aligns).** This function is very similar to haplotypeReadMatching, but the assumption is that the read is equal to the haplotype rather than* partially covering the haplotype (e.g. exome sequencing)* @param haplotype FlowBasedHaplotype single haplotype* @param read FlowBasedRead single read (trimmed to the haplotype)* @return a score for the alignment of the read to the haplotype. The higher the score the "better" the alignment* normally, scores will be negative.* @throws GATKException*/public double haplotypeReadMatchingExactLength(final FlowBasedHaplotype haplotype, final FlowBasedRead read) throws GATKException {if (read.getDirection() != FlowBasedRead.Direction.REFERENCE ) {throw new GATKException.ShouldNeverReachHereException("Read should be aligned with the reference");}if (!read.isBaseClipped() && fbargs.trimToHaplotype) {throw new GATKException.ShouldNeverReachHereException("Reads should be trimmed to the haplotype");}if (!read.isValid()) {return Double.NEGATIVE_INFINITY;}return fbargs.flowLikelihoodOptimizedComp? optimizedFlowLikelihoodScore(haplotype, read, 0,haplotype.getKeyLength()): flowLikelihoodScore(haplotype, read, 0, haplotype.getKeyLength());}/** calculate likelihood score between a read and a haplotype. This code is the non-optimnized (original) version* of the calculation code, as extracted from haplotypeReadMatching*/private double flowLikelihoodScore(FlowBasedHaplotype haplotype, FlowBasedRead read, int clipLeft, int clipRight) {final int[] key;key = Arrays.copyOfRange(haplotype.getKey(), clipLeft, clipRight);final byte[] flowOrder;flowOrder = Arrays.copyOfRange(haplotype.getFlowOrderArray(), clipLeft, clipRight);int startingPoint = 0;for (int i = 0; i < flowOrder.length; i++) {if (flowOrder[i] == read.getFlowOrderArray()[0]) {startingPoint = i;break;}}// this is the heart of the calculation of the likelihood. Since we assume in our model that between the// read and the haplotype there are no insertions / deletions of flows, we just search for the best starting// position of the read on the haplotype and then to calculate P(R|H) we just select the corresponding probabilities// from the flow matrix.// Another important optimization is that the best starting position of the read on the haplotype can be calculated// from the starting position of the read on the reference and the alignment of the haplotype to the reference.// This is why we trim the haplotype so that the aligned intervals have the same length as the read and allow a// small uncertainty.double bestAlignment = Double.NEGATIVE_INFINITY;for (int s = startingPoint; (s + read.getKeyLength() <= key.length); s += ALIGNMENT_UNCERTAINTY) {final int[] locationsToFetch = new int[read.getKeyLength()];for (int i = s; i < s + read.getKeyLength(); i++) {locationsToFetch[i - s] = key[i] & 0xff;locationsToFetch[i - s] = locationsToFetch[i - s] < read.getMaxHmer() + 1 ?locationsToFetch[i - s] : read.getMaxHmer() + 1;}double result = 0;for (int i = 0; i < locationsToFetch.length; i++) {final double prob;result += Math.log10(prob = read.getProb(i, locationsToFetch[i]));if (logger.isDebugEnabled()) {logger.debug("prob:" + read.getName() + " " + i + " " + locationsToFetch[i] + " " + prob);}}if (result > bestAlignment) {bestAlignment = result;}}return bestAlignment;}/** calculate likelihood score between a read and a haplotype. 
This code is based on original version* of the calculation code, as extracted from haplotypeReadMatching.** It code is optimized in that it performs fewer log10 calls - which are expensive - by using precomputed values* for common probability values.*/private double optimizedFlowLikelihoodScore(FlowBasedHaplotype haplotype, FlowBasedRead read, int clipLeft, int clipRight) {int[] key_o = haplotype.getKey();int key_ostart = clipLeft;int key_olen = clipRight - clipLeft;byte[] flow_order_o = haplotype.getFlowOrderArray();int flow_order_ostart = clipLeft;int flow_order_olen = clipRight - clipLeft;byte read_flow_0 = read.getFlowOrderArray()[0];int startingPoint = 0;for (int i = 0; i < flow_order_olen; i++) {if (flow_order_o[i+flow_order_ostart] == read_flow_0) {startingPoint = i;break;}}// this is the heart of the calculation of the likelihood. Since we assume in our model that between the// read and the haplotype there are no insertions / deletions of flows, we just search for the best starting// position of the read on the haplotype and then to calculate P(R|H) we just select the corresponding probabilities// from the flow matrix.// Another important optimization is that the best starting position of the read on the haplotype can be calculated// from the starting position of the read on the reference and the alignment of the haplotype to the reference.// This is why we trim the haplotype so that the aligned intervals have the same length as the read and allow a// small uncertainty.double bestAlignment = Double.NEGATIVE_INFINITY;int read_maxhmer_1 = read.getMaxHmer() + 1;int read_key_len = read.getKeyLength();final int[] locationsToFetch = new int[read_key_len];for (int s = startingPoint; (s + read_key_len <= key_olen); s += ALIGNMENT_UNCERTAINTY) {for (int i = s; i < s + read_key_len; i++) {if ( (locationsToFetch[i - s] = key_o[i+key_ostart] & 0xff) >= read_maxhmer_1 )locationsToFetch[i - s] = read_maxhmer_1;}double result = 0;for (int i = 0; i < read_key_len; i++) {final double prob = read.getProb(i, locationsToFetch[i]);double log10;if ( Precision.equals(prob, COMMON_PROB_VALUE_1) )log10 = COMMON_PROB_VALUE_1_LOG10;else if ( Precision.equals(prob, COMMON_PROB_VALUE_2) )log10 = COMMON_PROB_VALUE_2_LOG10;elselog10 = Math.log10(prob);result += log10;if (logger.isDebugEnabled())logger.debug("prob:" + read.getName() + " " + i + " " + locationsToFetch[i] + " " + prob);// shortcut - result will never riseif ( result < bestAlignment )break;}if (result > bestAlignment) {bestAlignment = result;}}return bestAlignment;}@Overridepublic void close() {}public double getLog10globalReadMismappingRate() {return log10globalReadMismappingRate;}public double getExpectedErrorRatePerBase() {return expectedErrorRatePerBase;}public boolean isSymmetricallyNormalizeAllelesToReference() {return symmetricallyNormalizeAllelesToReference;}
}
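The heart of haplotypeReadMatching above is a search over candidate offsets of the read's flow key along the clipped haplotype key, summing per-flow log10 probabilities at each offset. The sketch below is a simplified, self-contained restatement of that inner loop using plain arrays; readProbs[i][h] stands in for FlowBasedRead.getProb(i, h) and is not the GATK API.

// Simplified sketch of the flow-space scoring idea: slide the read over the haplotype key
// in steps of ALIGNMENT_UNCERTAINTY and keep the best sum of log10 P(call_i | haplotype hmer_i).
static double bestFlowAlignmentScore(final int[] haplotypeKey,
                                     final double[][] readProbs,   // readProbs[flow][hmer length]
                                     final int step) {
    final int readKeyLength = readProbs.length;
    double best = Double.NEGATIVE_INFINITY;
    for (int s = 0; s + readKeyLength <= haplotypeKey.length; s += step) {
        double logLik = 0;
        for (int i = 0; i < readKeyLength; i++) {
            // clamp the haplotype hmer length to the largest call the read's matrix models,
            // mirroring the maxHmer capping in the real engine
            final int hmer = Math.min(haplotypeKey[s + i], readProbs[i].length - 1);
            logLik += Math.log10(readProbs[i][hmer]);
        }
        best = Math.max(best, logLik);
    }
    return best;
}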
FlowBasedHMMEngine source
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMFileHeader;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.dragstr.DragstrParams;
import org.broadinstitute.hellbender.utils.genotyper.*;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.pairhmm.FlowBasedPairHMM;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMMInputScoreImputation;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMMInputScoreImputator;
import org.broadinstitute.hellbender.utils.read.FlowBasedReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.haplotype.FlowBasedHaplotype;
import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.ToDoubleFunction;

/**
 * Flow Based HMM, intended to incorporate the scoring model of the {@link FlowBasedAlignmentLikelihoodEngine} while allowing for frame-shift insertions
 * and deletions for better genotyping.
 */
public class FlowBasedHMMEngine implements ReadLikelihoodCalculationEngine {private final double readDisqualificationScale;private final boolean dynamicReadDisqualification;private double log10globalReadMismappingRate;private final double expectedErrorRatePerBase;private PairHMMLikelihoodCalculationEngine.PCRErrorModel pcrErrorModel;final FlowBasedArgumentCollection fbargs;@VisibleForTestingpublic static final double INITIAL_QSCORE = 40.0;private final FlowBasedPairHMM flowPairHMM;public static final byte MIN_USABLE_Q_SCORE_DEFAULT = 6;private static final int MIN_ADJUSTED_QSCORE = 10;private byte minUsableIndelScoreToUse;private PairHMMInputScoreImputator inputScoreImputator;private final DragstrParams dragstrParams;private byte constantGCP;private final byte flatInsertionPenalty;private final byte flatDeletionPenalty;/*** Default constructor* @param fbargs - arguments* @param log10globalReadMismappingRate - probability for wrong mapping (maximal contribution of the read to data likelihood)* @param expectedErrorRatePerBase - the expected rate of random sequencing errors for a read originating from its true haplotype.* @param pcrErrorModel*/public FlowBasedHMMEngine(final FlowBasedArgumentCollection fbargs, final byte constantGCP, final double log10globalReadMismappingRate, final double expectedErrorRatePerBase,final PairHMMLikelihoodCalculationEngine.PCRErrorModel pcrErrorModel, final DragstrParams dragstrParams, final boolean dynamicReadDisqualification, final double readDisqualificationScale,final int minUsableIndelScoreToUse, final byte flatDeletionPenalty, final byte flatInsertionPenalty) {this.fbargs = fbargs;this.log10globalReadMismappingRate = log10globalReadMismappingRate;this.expectedErrorRatePerBase = expectedErrorRatePerBase;this.readDisqualificationScale = readDisqualificationScale;this.dynamicReadDisqualification = dynamicReadDisqualification;this.pcrErrorModel = pcrErrorModel;this.flowPairHMM = new FlowBasedPairHMM();this.dragstrParams = dragstrParams;this.constantGCP = constantGCP;this.flatDeletionPenalty = flatDeletionPenalty;this.flatInsertionPenalty = flatInsertionPenalty;this.minUsableIndelScoreToUse = (byte)minUsableIndelScoreToUse;// validity checksif (fbargs.flowOrderCycleLength != 4) {throw new GATKException("FlowBasedHMMEngine requires flow order of 4 elements but cycle length is specified as " + fbargs.flowOrderCycleLength);}initializePCRErrorModel();}public AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(final List<Haplotype> haplotypeList,final SAMFileHeader hdr,final SampleList samples,final Map<String, List<GATKRead>> perSampleReadList,final boolean filterPoorly){return computeReadLikelihoods(haplotypeList, hdr, samples, perSampleReadList, filterPoorly, true);}public AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(final List<Haplotype> haplotypeList,final SAMFileHeader hdr,final SampleList samples,final Map<String, List<GATKRead>> perSampleReadList,final boolean filterPoorly,final boolean normalizeLikelihoods) {Utils.nonNull(samples, "samples is null");Utils.nonNull(perSampleReadList, "perSampleReadList is null");Utils.nonNull(haplotypeList, "haplotypeList is null");final AlleleList<Haplotype> haplotypes = new IndexedAlleleList<>(haplotypeList);// Add likelihoods for each sample's reads to our resultfinal AlleleLikelihoods<GATKRead, Haplotype> result = new AlleleLikelihoods<>(samples, haplotypes, perSampleReadList);final int sampleCount = result.numberOfSamples();for (int i = 0; i < sampleCount; i++) 
{computeReadLikelihoods(result.sampleMatrix(i), hdr);}if (normalizeLikelihoods) {result.normalizeLikelihoods(log10globalReadMismappingRate, true);}if ( filterPoorly ) {filterPoorlyModeledEvidence(result, dynamicReadDisqualification, expectedErrorRatePerBase, readDisqualificationScale);}return result;}/** Calculate minimal likelihood that reasonably matching read can get. We divide errors into "expected", e.g.* hmer indels without 0->1/1->0 errors. Those on average occur with expectedErrorRate and "catastrophic' e.g.* 1->0 and 0->1 errors (or large hmer indels). Those occur with probability fbargs.fillingValue.* If the read has more than 3 expected and more than 2 "catastrophic" errors it will probably be deemed unfit to the* haplotype** @param expectedErrorRate error rate for expected errors.* @return minimal likelihood for the read to be considered not poorly modeled*/@Overridepublic ToDoubleFunction<GATKRead> log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) {final double log10ErrorRate = Math.log10(expectedErrorRate);final double largeEventErrorRate = 0.001; // error rate for non-hmer/snv errors that are not seq. errors.final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate);return read -> {final double maxErrorsForRead = Math.max(3.0, Math.ceil(read.getLength() * expectedErrorRate));final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * largeEventErrorRate));return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*log10catastrophicErrorRate;};}private byte[] pcrIndelErrorModelCache;private void initializePCRErrorModel() {inputScoreImputator = dragstrParams == null? NonSymmetricalPairHMMInputScoreImputator.newInstance(constantGCP, flatInsertionPenalty, flatDeletionPenalty): DragstrPairHMMInputScoreImputator.of(dragstrParams) ;if ( !pcrErrorModel.hasRateFactor() ) {return;}pcrIndelErrorModelCache = new byte[ReadLikelihoodCalculationEngine.MAX_REPEAT_LENGTH + 1];final double rateFactor = pcrErrorModel.getRateFactor();for( int i = 0; i <= ReadLikelihoodCalculationEngine.MAX_REPEAT_LENGTH; i++ ) {pcrIndelErrorModelCache[i] = getErrorModelAdjustedQual(i, rateFactor, minUsableIndelScoreToUse);}}// TODO these methods are ripped whole cloth from the PairHMMLikelihoodsCalculationEngine and should be uinfied/fixed if possible@VisibleForTestingvoid applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) {if ( pcrErrorModel == PairHMMLikelihoodCalculationEngine.PCRErrorModel.NONE ) {return;}for ( int i = 1; i < readBases.length; i++ ) {final int repeatLength = ReadLikelihoodCalculationEngine.findTandemRepeatUnits(readBases, i-1).getRight();readInsQuals[i-1] = (byte) Math.min(0xff & readInsQuals[i - 1], 0xff & pcrIndelErrorModelCache[repeatLength]);readDelQuals[i-1] = (byte) Math.min(0xff & readDelQuals[i - 1], 0xff & pcrIndelErrorModelCache[repeatLength]);}}private static void capMinimumReadIndelQualities(final byte[] readInsQuals, final byte[] readDelQuals, final byte minUsableQualScore) {for( int i = 0; i < readInsQuals.length; i++ ) {readInsQuals[i] = (byte)Math.max(readInsQuals[i], minUsableQualScore);readDelQuals[i] = (byte)Math.max(readDelQuals[i], minUsableQualScore);}}static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor, final byte minUsableIndelScoreToUse) {return (byte) Math.max(minUsableIndelScoreToUse, MathUtils.fastRound(INITIAL_QSCORE - Math.exp(repeatLength / (rateFactor * Math.PI)) + 1.0));}/*** Initialize our 
flowPairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate** After calling this routine the PairHMM will be configured to best evaluate all reads in the samples* against the set of haplotypes* @param haplotypes a non-null list of haplotypes* @param perSampleReadList a mapping from sample -> reads*/private void initializeFlowPairHMM(final List<FlowBasedHaplotype> haplotypes, final List<FlowBasedRead> perSampleReadList) {initializePCRErrorModel();final int readMaxLength = perSampleReadList.stream().mapToInt(FlowBasedRead::getKeyLength).max().orElse(0);final int haplotypeMaxLength = haplotypes.stream().mapToInt(h -> h.getKeyLength()).max().orElse(0);// initialize arrays to hold the probabilities of being in the match, insertion and deletion casesflowPairHMM.initialize(readMaxLength, haplotypeMaxLength);;}/*** Compute read likelihoods for a single sample* @param likelihoods Single sample likelihood matrix* @param hdr SAM header that corresponds to the sample*/private void computeReadLikelihoods(final LikelihoodMatrix<GATKRead, Haplotype> likelihoods,final SAMFileHeader hdr) {final List<FlowBasedRead> processedReads = new ArrayList<>(likelihoods.evidenceCount());final List<FlowBasedHaplotype> processedHaplotypes = new ArrayList<>(likelihoods.numberOfAlleles());// establish flow order based on the first evidence. Note that all reads belong to the same sample (group)final FlowBasedReadUtils.ReadGroupInfo rgInfo = (likelihoods.evidenceCount() != 0)? FlowBasedReadUtils.getReadGroupInfo(hdr, likelihoods.evidence().get(0)): null;final String flowOrder = (rgInfo != null)? rgInfo.flowOrder.substring(0, fbargs.flowOrderCycleLength): FlowBasedReadUtils.findFirstUsableFlowOrder(hdr, fbargs);//convert all reads to FlowBasedReads (i.e. parse the matrix of P(call | haplotype) for each read from the BAM)for (int i = 0 ; i < likelihoods.evidenceCount(); i++) {final GATKRead rd = likelihoods.evidence().get(i);// create a flow based readfinal FlowBasedRead fbRead = new FlowBasedRead(rd, flowOrder, rgInfo.maxClass, fbargs);fbRead.applyAlignment();//TODO This currently supports any ScoreImputator in GATK but will probably need a custom one to handle flow based data in the future:final PairHMMInputScoreImputation inputScoreImputation = inputScoreImputator.impute(fbRead);final byte[] readInsQuals = inputScoreImputation.insOpenPenalties();final byte[] readDelQuals = inputScoreImputation.delOpenPenalties();final byte[] overallGCP = inputScoreImputation.gapContinuationPenalties();applyPCRErrorModel(fbRead.getBases(), readInsQuals, readDelQuals);capMinimumReadIndelQualities(readInsQuals, readDelQuals, minUsableIndelScoreToUse);fbRead.setReadInsQuals(readInsQuals);fbRead.setReadDelQuals(readDelQuals);fbRead.setOverallGCP(overallGCP);processedReads.add(fbRead);}//same for the haplotypes - each haplotype is converted to FlowBasedHaplotypefor (int i = 0; i < likelihoods.numberOfAlleles(); i++){final FlowBasedHaplotype fbh = new FlowBasedHaplotype(likelihoods.alleles().get(i), flowOrder);processedHaplotypes.add(fbh);}initializeFlowPairHMM(processedHaplotypes, processedReads);flowPairHMM.computeLog10LikelihoodsFlowBased(likelihoods, processedReads, processedHaplotypes);}@Overridepublic void close() {}}
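FlowBasedHMMEngine borrows the PCR indel error model from PairHMMLikelihoodCalculationEngine: the longer the tandem repeat a base sits in, the lower the cap applied to its insertion/deletion qualities. The snippet below is a self-contained restatement of getErrorModelAdjustedQual from the listing above, with Math.round standing in for MathUtils.fastRound, just to make the shape of the curve easy to see.

// Standalone restatement of getErrorModelAdjustedQual. With rateFactor = 2.0 (the AGGRESSIVE
// model) and a floor of 10: repeat length 0 gives ~40, repeat length 20 gives ~17, and long
// enough repeats bottom out at the floor.
static byte errorModelAdjustedQual(final int repeatLength, final double rateFactor, final byte minQual) {
    final double initialQscore = 40.0;
    final long adjusted = Math.round(initialQscore - Math.exp(repeatLength / (rateFactor * Math.PI)) + 1.0);
    return (byte) Math.max(minQual, adjusted);
}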
PDPairHMMLikelihoodCalculationEngine source
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMFileHeader;
import org.broadinstitute.gatk.nativebindings.pairhmm.PairHMMNativeArguments;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.dragstr.DragstrParams;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
import org.broadinstitute.hellbender.utils.genotyper.*;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.haplotype.PartiallyDeterminedHaplotype;
import org.broadinstitute.hellbender.utils.pairhmm.LoglessPDPairHMM;
import org.broadinstitute.hellbender.utils.pairhmm.PDPairHMM;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMM;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMMInputScoreImputator;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;

import java.io.OutputStreamWriter;
import java.util.*;
import java.util.function.ToDoubleFunction;
import java.util.stream.Collectors;

/**
 * Classic likelihood computation: full pair-hmm all haplotypes vs all reads.
 *
 * Note this is the "Partially Determined" Pair hmm engine, which means the haplotypes it sees are expected to be
 * PartiallyDeterminedHaplotype objects. These are special in that they all have a subset of "determined" or present
 * bases corresponding to variants and everything else is "undetermined." The likelihood scores are computed by taking
 * the maximum possible score for the read vs the un-determined bases and behaving normally over the determined ones
 * that have a real subset of alleles from the assembly region.
 *
 * Since the operations and methods differ from the normal PairHMM significantly we have opted to create an entirely seperate
 * codepath of PD___HMM classes to handle the differences.
 */
public final class PDPairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine {private static final int MIN_ADJUSTED_QSCORE = 10;@VisibleForTestingstatic final double INITIAL_QSCORE = 40.0;public static final String HMM_BASE_QUALITIES_TAG = "HMMQuals";private final byte constantGCP;private final double log10globalReadMismappingRate;private final PDPairHMM pdPairHMM;public static final String UNCLIPPED_ORIGINAL_SPAN_ATTR = "originalAlignment";// DRAGEN-GATK related parametersprivate final DragstrParams dragstrParams;private final boolean dynamicDisqualification;private final double readDisqualificationScale;private final double expectedErrorRatePerBase;private final boolean disableCapReadQualitiesToMapQ;private final boolean symmetricallyNormalizeAllelesToReference;private final boolean modifySoftclippedBases;private final int rangeForReadOverlapToDeterminedBases;private final PairHMMLikelihoodCalculationEngine.PCRErrorModel pcrErrorModel;private final byte baseQualityScoreThreshold;/*** Create a new PairHMMLikelihoodCalculationEngine using provided parameters and hmm to do its calculations** @param constantGCP the gap continuation penalty to use with the PairHMM* @param hmmType the type of the HMM to use* @param resultsFile output file to dump per-read, per-haplotype inputs and outputs for debugging purposes (null if not enabled).* @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of* -3 means that the chance that a read doesn't actually belong at this* location in the genome is 1 in 1000. The effect of this parameter is* to cap the maximum likelihood difference between the reference haplotype* and the best alternative haplotype by -3 log units. So if the best* haplotype is at -10 and this parameter has a value of -3 then even if the* reference haplotype gets a score of -100 from the pairhmm it will be* assigned a likelihood of -13.* @param pcrErrorModel model to correct for PCR indel artifacts* @param baseQualityScoreThreshold Base qualities below this threshold will be reduced to the minimum usable base* quality.*/public PDPairHMMLikelihoodCalculationEngine(final byte constantGCP,final DragstrParams dragstrParams,final PairHMMNativeArguments arguments,final PairHMM.Implementation hmmType,final GATKPath resultsFile,final double log10globalReadMismappingRate,final PairHMMLikelihoodCalculationEngine.PCRErrorModel pcrErrorModel,final byte baseQualityScoreThreshold,final boolean dynamicReadDisqualificaiton,final double readDisqualificationScale,final double expectedErrorRatePerBase,final boolean symmetricallyNormalizeAllelesToReference,final boolean disableCapReadQualitiesToMapQ,final boolean modifySoftclippedBases,final int rangeForReadOverlapToDeterminedBases) {Utils.nonNull(hmmType, "hmmType is null");Utils.nonNull(pcrErrorModel, "pcrErrorModel is null");if (constantGCP < 0){throw new IllegalArgumentException("gap continuation penalty must be non-negative");}if (log10globalReadMismappingRate > 0){throw new IllegalArgumentException("log10globalReadMismappingRate must be negative");}this.dragstrParams = dragstrParams;this.constantGCP = constantGCP;this.log10globalReadMismappingRate = log10globalReadMismappingRate;this.pcrErrorModel = this.dragstrParams == null ? 
pcrErrorModel : PairHMMLikelihoodCalculationEngine.PCRErrorModel.NONE;// TODO later we probably need a LOG and LOGLESS version for parsimony with DRAGENthis.pdPairHMM = new LoglessPDPairHMM();if (resultsFile != null) {pdPairHMM.setAndInitializeDebugOutputStream(new OutputStreamWriter(resultsFile.getOutputStream()));}this.dynamicDisqualification = dynamicReadDisqualificaiton;this.readDisqualificationScale = readDisqualificationScale;this.symmetricallyNormalizeAllelesToReference = symmetricallyNormalizeAllelesToReference;this.expectedErrorRatePerBase = expectedErrorRatePerBase;this.disableCapReadQualitiesToMapQ = disableCapReadQualitiesToMapQ;this.modifySoftclippedBases = modifySoftclippedBases;this.rangeForReadOverlapToDeterminedBases = rangeForReadOverlapToDeterminedBases;initializePCRErrorModel();if (baseQualityScoreThreshold < QualityUtils.MIN_USABLE_Q_SCORE) {throw new IllegalArgumentException("baseQualityScoreThreshold must be greater than or equal to " + QualityUtils.MIN_USABLE_Q_SCORE + " (QualityUtils.MIN_USABLE_Q_SCORE)");}this.baseQualityScoreThreshold = baseQualityScoreThreshold;}@Overridepublic void close() {pdPairHMM.close();}@Overridepublic void filterPoorlyModeledEvidence(AlleleLikelihoods<GATKRead, ? extends Haplotype> result, boolean dynamicDisqualification, double expectedErrorRatePerBase, double readDisqualificationScale) {ReadLikelihoodCalculationEngine.super.filterPoorlyModeledEvidence(result, dynamicDisqualification, expectedErrorRatePerBase, readDisqualificationScale);}@Overridepublic ToDoubleFunction<GATKRead> log10MinTrueLikelihood(double maximumErrorPerBase, boolean capLikelihoods) {return ReadLikelihoodCalculationEngine.super.log10MinTrueLikelihood(maximumErrorPerBase, capLikelihoods);}@Overridepublic AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(List<Haplotype> haplotypeList, SAMFileHeader hdr, SampleList samples, Map<String, List<GATKRead>> perSampleReadList, boolean filterPoorly) {throw new GATKException.ShouldNeverReachHereException("We should never get here, this HMM engine exists only for PD haplotype computation");}@Override@SuppressWarnings("unchecked")// NOTE that the PairHMM doesn't need to interrogate the header so we skip checking it for this versionpublic AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(AssemblyResultSet assemblyResultSet, SampleList samples,Map<String, List<GATKRead>> perSampleReadList,boolean filterPoorly) {Utils.nonNull(assemblyResultSet, "assemblyResultSet is null");if (assemblyResultSet.getHaplotypeList().stream().anyMatch(haplotype -> !haplotype.isPartiallyDetermined())) {throw new GATKException("PDHMM engine requires PartiallyDeterminedHaplotype objects as input");}final List<PartiallyDeterminedHaplotype> haplotypeList = assemblyResultSet.getHaplotypeList().stream().map(h -> (PartiallyDeterminedHaplotype)h).collect(Collectors.toList());AlleleLikelihoods<GATKRead, Haplotype> resut = (AlleleLikelihoods<GATKRead, Haplotype>) computeReadLikelihoodsPartiallyDetermined(haplotypeList, null, samples, perSampleReadList, filterPoorly);return resut;}public AlleleLikelihoods<GATKRead, ? 
extends Haplotype> computeReadLikelihoodsPartiallyDetermined(final List<PartiallyDeterminedHaplotype> haplotypeList,final SAMFileHeader hdr,final SampleList samples,final Map<String, List<GATKRead>> perSampleReadList, final boolean filterPoorly) {Utils.nonNull(samples, "samples is null");Utils.nonNull(perSampleReadList, "perSampleReadList is null");Utils.nonNull(haplotypeList, "haplotypeList is null");final AlleleList<PartiallyDeterminedHaplotype> haplotypes = new IndexedAlleleList<>(haplotypeList);initializePairHMM(haplotypeList, perSampleReadList);// Add likelihoods for each sample's reads to our resultfinal AlleleLikelihoods<GATKRead, PartiallyDeterminedHaplotype> result = new AlleleLikelihoods<>(samples, haplotypes, perSampleReadList);final int sampleCount = result.numberOfSamples();for (int i = 0; i < sampleCount; i++) {computeReadLikelihoods(result.sampleMatrix(i));}result.normalizeLikelihoods(log10globalReadMismappingRate, symmetricallyNormalizeAllelesToReference);filterPoorlyModeledEvidence(result, dynamicDisqualification, expectedErrorRatePerBase, readDisqualificationScale);return result;}/*** Creates a new GATKRead with the source read's header, read group and mate* information, but with the following fields set to user-supplied values:* - Read Bases* - Base Qualities* - Base Insertion Qualities* - Base Deletion Qualities** Cigar string is empty (not-null)** Use this method if you want to create a new GATKRead based on* another GATKRead, but with modified bases and qualities** @param read a read to copy the header from* @param readBases an array containing the new bases you wish use in place of the originals* @param baseQualities an array containing the new base qualities you wish use in place of the originals* @param baseInsertionQualities an array containing the new base insertion qaulities* @param baseDeletionQualities an array containing the new base deletion qualities* @return a read with modified bases and qualities, safe for the GATK*/private static GATKRead createQualityModifiedRead(final GATKRead read,final byte[] readBases,final byte[] baseQualities,final byte[] baseInsertionQualities,final byte[] baseDeletionQualities) {Utils.validateArg( baseQualities.length == readBases.length && baseInsertionQualities.length == readBases.length && baseDeletionQualities.length == readBases.length,() -> String.format("Read bases and read quality arrays aren't the same size: Bases: %d vs Base Q's: %d vs Insert Q's: %d vs Delete Q's: %d.",readBases.length, baseQualities.length, baseInsertionQualities.length, baseDeletionQualities.length));final GATKRead processedRead = ReadUtils.emptyRead(read);processedRead.setBases(readBases);processedRead.setBaseQualities(baseQualities);ReadUtils.setInsertionBaseQualities(processedRead, baseInsertionQualities);ReadUtils.setDeletionBaseQualities(processedRead, baseDeletionQualities);return processedRead;}/*** Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate** After calling this routine the PairHMM will be configured to best evaluate all reads in the samples* against the set of haplotypes** @param haplotypes a non-null list of haplotypes* @param perSampleReadList a mapping from sample -> reads*/private void initializePairHMM(final List<PartiallyDeterminedHaplotype> haplotypes, final Map<String, List<GATKRead>> perSampleReadList) {final int readMaxLength = perSampleReadList.entrySet().stream().flatMap(e -> e.getValue().stream()).mapToInt(GATKRead::getLength).max().orElse(0);final int 
haplotypeMaxLength = haplotypes.stream().mapToInt(h -> h.getBases().length).max().orElse(0);// initialize arrays to hold the probabilities of being in the match, insertion and deletion casespdPairHMM.initialize(haplotypes, perSampleReadList, readMaxLength, haplotypeMaxLength);}private void computeReadLikelihoods(final LikelihoodMatrix<GATKRead, PartiallyDeterminedHaplotype> likelihoods) {// Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualitiesfinal List<GATKRead> processedReads = modifyReadQualities(likelihoods.evidence());if (HaplotypeCallerGenotypingDebugger.isEnabled()) {for(int counter = 0; counter < processedReads.size(); counter++) {GATKRead read = processedReads.get(counter);HaplotypeCallerGenotypingDebugger.println("Range for Overlaps to Variants for consideration: "+rangeForReadOverlapToDeterminedBases);HaplotypeCallerGenotypingDebugger.println("read "+counter +": "+read.getName()+" cigar: "+read.getCigar()+" mapQ: "+read.getMappingQuality()+" loc: ["+read.getStart() +"-"+ read.getEnd()+"] unclippedloc: ["+read.getUnclippedStart()+"-"+read.getUnclippedEnd()+"]");HaplotypeCallerGenotypingDebugger.println(Arrays.toString(read.getBaseQualitiesNoCopy()));}}// Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotypepdPairHMM.computeLog10Likelihoods(likelihoods, processedReads, inputScoreImputator, rangeForReadOverlapToDeterminedBases);}/*** Pre-processing of the reads to be evaluated at the current location from the current sample.* We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read.* Modified copies of reads are packed into a new list, while original reads are retained for downstream use** @param reads The original list of unmodified reads* @return processedReads. A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding*/private List<GATKRead> modifyReadQualities(final List<GATKRead> reads) {final List<GATKRead> result = new ArrayList<>(reads.size());for (final GATKRead read : reads) {final GATKRead maybeUnclipped = modifySoftclippedBases ? 
read : ReadClipper.hardClipSoftClippedBases(read); //Clip the bases here to remove hapfinal byte[] readBases = maybeUnclipped.getBases();//For FRD we want to have scores for reads that don't overlapfinal SimpleInterval readSpan = new SimpleInterval(read.getContig(), read.getUnclippedStart(), read.getUnclippedEnd());// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read//Using close here is justified - it's an array of primitives.final byte[] readQuals = maybeUnclipped.getBaseQualities().clone();final byte[] readInsQuals = ReadUtils.getBaseInsertionQualities(maybeUnclipped).clone();final byte[] readDelQuals = ReadUtils.getBaseDeletionQualities(maybeUnclipped).clone();applyPCRErrorModel(readBases, readInsQuals, readDelQuals);capMinimumReadQualities(maybeUnclipped, readQuals, readInsQuals, readDelQuals, baseQualityScoreThreshold, disableCapReadQualitiesToMapQ);// Store the actual qualitiesread.setTransientAttribute(HMM_BASE_QUALITIES_TAG, readQuals);// Create a new copy of the read and sets its base qualities to the modified versions.GATKRead qualityModifiedRead = createQualityModifiedRead(maybeUnclipped, readBases, readQuals, readInsQuals, readDelQuals);qualityModifiedRead.setTransientAttribute(UNCLIPPED_ORIGINAL_SPAN_ATTR, readSpan);result.add(qualityModifiedRead);}return result;}private static void capMinimumReadQualities(final GATKRead read, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, final byte baseQualityScoreThreshold, final boolean disableCapReadQualitiesToMapQ) {for( int i = 0; i < readQuals.length; i++ ) {if (!disableCapReadQualitiesToMapQ) {readQuals[i] = (byte) Math.min(0xff & readQuals[i], read.getMappingQuality()); // cap base quality by mapping quality, as in UG}readQuals[i] = setToFixedValueIfTooLow( readQuals[i], baseQualityScoreThreshold, QualityUtils.MIN_USABLE_Q_SCORE );readInsQuals[i] = setToFixedValueIfTooLow( readInsQuals[i], QualityUtils.MIN_USABLE_Q_SCORE, QualityUtils.MIN_USABLE_Q_SCORE );readDelQuals[i] = setToFixedValueIfTooLow( readDelQuals[i], QualityUtils.MIN_USABLE_Q_SCORE, QualityUtils.MIN_USABLE_Q_SCORE );}}private static byte setToFixedValueIfTooLow(final byte currentVal, final byte minQual, final byte fixedQual){return currentVal < minQual ? fixedQual : currentVal;}private static Map<GATKRead, byte[]> buildGapContinuationPenalties(final List<GATKRead> reads, final byte gapPenalty) {final Map<GATKRead, byte[]> result = new HashMap<>(reads.size());reads.stream().forEach(read -> result.put(read, Utils.dupBytes(gapPenalty, read.getLength())));return result;}/* --------------------------------------------------------------------------------** Experimental attempts at PCR error rate modeling*-------------------------------------------------------------------------------- */private byte[] pcrIndelErrorModelCache;private PairHMMInputScoreImputator inputScoreImputator;private void initializePCRErrorModel() {inputScoreImputator = dragstrParams == null? 
StandardPairHMMInputScoreImputator.newInstance(constantGCP): DragstrPairHMMInputScoreImputator.of(dragstrParams) ;if ( !pcrErrorModel.hasRateFactor() ) {return;}pcrIndelErrorModelCache = new byte[ReadLikelihoodCalculationEngine.MAX_REPEAT_LENGTH + 1];final double rateFactor = pcrErrorModel.getRateFactor();for(int i = 0; i <= ReadLikelihoodCalculationEngine.MAX_REPEAT_LENGTH; i++ ) {pcrIndelErrorModelCache[i] = getErrorModelAdjustedQual(i, rateFactor);}}static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) {return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound(INITIAL_QSCORE - Math.exp(repeatLength / (rateFactor * Math.PI)) + 1.0));}@VisibleForTestingvoid applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) {if ( pcrErrorModel == PairHMMLikelihoodCalculationEngine.PCRErrorModel.NONE ) {return;}for ( int i = 1; i < readBases.length; i++ ) {final int repeatLength = ReadLikelihoodCalculationEngine.findTandemRepeatUnits(readBases, i-1).getRight();readInsQuals[i-1] = (byte) Math.min(0xff & readInsQuals[i - 1], 0xff & pcrIndelErrorModelCache[repeatLength]);readDelQuals[i-1] = (byte) Math.min(0xff & readDelQuals[i - 1], 0xff & pcrIndelErrorModelCache[repeatLength]);}}}
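One parameter that all of these engines share is log10globalReadMismappingRate, documented in the constructor javadoc above: during likelihood normalization it caps how far any haplotype's likelihood for a given read may fall below that read's best haplotype. The helper below is a hypothetical illustration of that documented rule, not the actual AlleleLikelihoods.normalizeLikelihoods implementation.

// Illustration of the cap described in the javadoc: with a mismapping rate of -3 log10 units,
// no haplotype may score more than 3 log units below the read's best haplotype.
static double capToBestHaplotype(final double rawLog10Lik,
                                 final double bestLog10LikForRead,
                                 final double log10GlobalReadMismappingRate) {
    return Math.max(rawLog10Lik, bestLog10LikForRead + log10GlobalReadMismappingRate);
}

// Worked example from the javadoc: the best haplotype scores -10, the rate is -3, and the
// reference haplotype gets -100 from the PairHMM; it is then assigned -13.
// capToBestHaplotype(-100, -10, -3) == -13.0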
PairHMMLikelihoodCalculationEngine source
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMFileHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.gatk.nativebindings.pairhmm.PairHMMNativeArguments;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.utils.dragstr.DragstrParams;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
import org.broadinstitute.hellbender.utils.genotyper.*;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMM;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMMInputScoreImputator;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;

import java.io.OutputStreamWriter;
import java.util.*;

/** Classic likelihood computation: full pair-hmm all haplotypes vs all reads. */
public final class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine {public static final double DEFAULT_DYNAMIC_DISQUALIFICATION_SCALE_FACTOR = 1.0;private static final Logger logger = LogManager.getLogger(PairHMMLikelihoodCalculationEngine.class);private static final int MIN_ADJUSTED_QSCORE = 10;@VisibleForTestingstatic final double INITIAL_QSCORE = 40.0;public static final String HMM_BASE_QUALITIES_TAG = "HMMQuals";private final byte constantGCP;private final double log10globalReadMismappingRate;private final PairHMM pairHMM;// DRAGEN-GATK related parametersprivate final DragstrParams dragstrParams;private final boolean dynamicDisqualification;private final double readDisqualificationScale;private final double expectedErrorRatePerBase;private final boolean disableCapReadQualitiesToMapQ;private final boolean symmetricallyNormalizeAllelesToReference;private final boolean modifySoftclippedBases;public enum PCRErrorModel {/** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */NONE(0.0),/** a most aggressive model will be applied that sacrifices true positives in order to remove more false positives */HOSTILE(1.0),/** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */AGGRESSIVE(2.0),/** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */CONSERVATIVE(3.0);private final double rateFactor;/** rate factor is applied to the PCR error model. Can be 0.0 to imply no correction */PCRErrorModel(final double rateFactor) {this.rateFactor = rateFactor;}public double getRateFactor() { return rateFactor; }public boolean hasRateFactor() { return rateFactor != 0.0; }}private final PCRErrorModel pcrErrorModel;private final byte baseQualityScoreThreshold;/*** Create a new PairHMMLikelihoodCalculationEngine using provided parameters and hmm to do its calculations** @param constantGCP the gap continuation penalty to use with the PairHMM* @param hmmType the type of the HMM to use* @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of* -3 means that the chance that a read doesn't actually belong at this* location in the genome is 1 in 1000. The effect of this parameter is* to cap the maximum likelihood difference between the reference haplotype* and the best alternative haplotype by -3 log units. 
So if the best* haplotype is at -10 and this parameter has a value of -3 then even if the* reference haplotype gets a score of -100 from the pairhmm it will be* assigned a likelihood of -13.* @param pcrErrorModel model to correct for PCR indel artifacts*/public PairHMMLikelihoodCalculationEngine(final byte constantGCP,final DragstrParams dragstrParams,final PairHMMNativeArguments arguments,final PairHMM.Implementation hmmType,final double log10globalReadMismappingRate,final PCRErrorModel pcrErrorModel) {this( constantGCP, dragstrParams, arguments, hmmType, null, log10globalReadMismappingRate, pcrErrorModel, PairHMM.BASE_QUALITY_SCORE_THRESHOLD, false, DEFAULT_DYNAMIC_DISQUALIFICATION_SCALE_FACTOR, ReadLikelihoodCalculationEngine.DEFAULT_EXPECTED_ERROR_RATE_PER_BASE, true, false, true);}/*** Create a new PairHMMLikelihoodCalculationEngine using provided parameters and hmm to do its calculations** @param constantGCP the gap continuation penalty to use with the PairHMM* @param hmmType the type of the HMM to use* @param resultsFile output file to dump per-read, per-haplotype inputs and outputs for debugging purposes (null if not enabled).* @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of* -3 means that the chance that a read doesn't actually belong at this* location in the genome is 1 in 1000. The effect of this parameter is* to cap the maximum likelihood difference between the reference haplotype* and the best alternative haplotype by -3 log units. So if the best* haplotype is at -10 and this parameter has a value of -3 then even if the* reference haplotype gets a score of -100 from the pairhmm it will be* assigned a likelihood of -13.* @param pcrErrorModel model to correct for PCR indel artifacts* @param baseQualityScoreThreshold Base qualities below this threshold will be reduced to the minimum usable base* quality.*/public PairHMMLikelihoodCalculationEngine(final byte constantGCP,final DragstrParams dragstrParams,final PairHMMNativeArguments arguments,final PairHMM.Implementation hmmType,final GATKPath resultsFile,final double log10globalReadMismappingRate,final PCRErrorModel pcrErrorModel,final byte baseQualityScoreThreshold,final boolean dynamicReadDisqualificaiton,final double readDisqualificationScale,final double expectedErrorRatePerBase,final boolean symmetricallyNormalizeAllelesToReference,final boolean disableCapReadQualitiesToMapQ,final boolean modifySoftclippedBases) {Utils.nonNull(hmmType, "hmmType is null");Utils.nonNull(pcrErrorModel, "pcrErrorModel is null");if (constantGCP < 0){throw new IllegalArgumentException("gap continuation penalty must be non-negative");}if (log10globalReadMismappingRate > 0){throw new IllegalArgumentException("log10globalReadMismappingRate must be negative");}this.dragstrParams = dragstrParams;this.constantGCP = constantGCP;this.log10globalReadMismappingRate = log10globalReadMismappingRate;this.pcrErrorModel = this.dragstrParams == null ? 
    /**
     * Create a new PairHMMLikelihoodCalculationEngine using provided parameters and hmm to do its calculations
     *
     * @param constantGCP the gap continuation penalty to use with the PairHMM
     * @param hmmType the type of the HMM to use
     * @param resultsFile output file to dump per-read, per-haplotype inputs and outputs for debugging purposes (null if not enabled).
     * @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of
     *                                      -3 means that the chance that a read doesn't actually belong at this
     *                                      location in the genome is 1 in 1000. The effect of this parameter is
     *                                      to cap the maximum likelihood difference between the reference haplotype
     *                                      and the best alternative haplotype by -3 log units. So if the best
     *                                      haplotype is at -10 and this parameter has a value of -3 then even if the
     *                                      reference haplotype gets a score of -100 from the pairhmm it will be
     *                                      assigned a likelihood of -13.
     * @param pcrErrorModel model to correct for PCR indel artifacts
     * @param baseQualityScoreThreshold Base qualities below this threshold will be reduced to the minimum usable base
     *                                  quality.
     */
    public PairHMMLikelihoodCalculationEngine(final byte constantGCP,
                                              final DragstrParams dragstrParams,
                                              final PairHMMNativeArguments arguments,
                                              final PairHMM.Implementation hmmType,
                                              final GATKPath resultsFile,
                                              final double log10globalReadMismappingRate,
                                              final PCRErrorModel pcrErrorModel,
                                              final byte baseQualityScoreThreshold,
                                              final boolean dynamicReadDisqualificaiton,
                                              final double readDisqualificationScale,
                                              final double expectedErrorRatePerBase,
                                              final boolean symmetricallyNormalizeAllelesToReference,
                                              final boolean disableCapReadQualitiesToMapQ,
                                              final boolean modifySoftclippedBases) {
        Utils.nonNull(hmmType, "hmmType is null");
        Utils.nonNull(pcrErrorModel, "pcrErrorModel is null");
        if (constantGCP < 0){
            throw new IllegalArgumentException("gap continuation penalty must be non-negative");
        }
        if (log10globalReadMismappingRate > 0){
            throw new IllegalArgumentException("log10globalReadMismappingRate must be negative");
        }

        this.dragstrParams = dragstrParams;
        this.constantGCP = constantGCP;
        this.log10globalReadMismappingRate = log10globalReadMismappingRate;
        this.pcrErrorModel = this.dragstrParams == null ? pcrErrorModel : PCRErrorModel.NONE;
        this.pairHMM = hmmType.makeNewHMM(arguments);
        if (resultsFile != null) {
            pairHMM.setAndInitializeDebugOutputStream(new OutputStreamWriter(resultsFile.getOutputStream()));
        }
        this.dynamicDisqualification = dynamicReadDisqualificaiton;
        this.readDisqualificationScale = readDisqualificationScale;
        this.symmetricallyNormalizeAllelesToReference = symmetricallyNormalizeAllelesToReference;
        this.expectedErrorRatePerBase = expectedErrorRatePerBase;
        this.disableCapReadQualitiesToMapQ = disableCapReadQualitiesToMapQ;
        this.modifySoftclippedBases = modifySoftclippedBases;

        initializePCRErrorModel();

        if (baseQualityScoreThreshold < QualityUtils.MIN_USABLE_Q_SCORE) {
            throw new IllegalArgumentException("baseQualityScoreThreshold must be greater than or equal to " + QualityUtils.MIN_USABLE_Q_SCORE + " (QualityUtils.MIN_USABLE_Q_SCORE)");
        }
        this.baseQualityScoreThreshold = baseQualityScoreThreshold;
    }

    @Override
    public void close() {
        pairHMM.close();
    }

    @Override
    // NOTE that the PairHMM doesn't need to interrogate the header so we skip checking it for this version
    public AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(AssemblyResultSet assemblyResultSet, SampleList samples,
                                                                         Map<String, List<GATKRead>> perSampleReadList,
                                                                         boolean filterPoorly) {
        Utils.nonNull(assemblyResultSet, "assemblyResultSet is null");
        final List<Haplotype> haplotypeList = assemblyResultSet.getHaplotypeList();
        return computeReadLikelihoods(haplotypeList, null, samples, perSampleReadList, filterPoorly);
    }

    @Override
    public AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(final List<Haplotype> haplotypeList,
                                                                         final SAMFileHeader hdr,
                                                                         final SampleList samples,
                                                                         final Map<String, List<GATKRead>> perSampleReadList,
                                                                         final boolean filterPoorly) {
        Utils.nonNull(samples, "samples is null");
        Utils.nonNull(perSampleReadList, "perSampleReadList is null");
        Utils.nonNull(haplotypeList, "haplotypeList is null");

        final AlleleList<Haplotype> haplotypes = new IndexedAlleleList<>(haplotypeList);

        initializePairHMM(haplotypeList, perSampleReadList);

        // Add likelihoods for each sample's reads to our result
        final AlleleLikelihoods<GATKRead, Haplotype> result = new AlleleLikelihoods<>(samples, haplotypes, perSampleReadList);
        final int sampleCount = result.numberOfSamples();
        for (int i = 0; i < sampleCount; i++) {
            computeReadLikelihoods(result.sampleMatrix(i));
        }

        result.normalizeLikelihoods(log10globalReadMismappingRate, symmetricallyNormalizeAllelesToReference);
        filterPoorlyModeledEvidence(result, dynamicDisqualification, expectedErrorRatePerBase, readDisqualificationScale);
        return result;
    }
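    // 编者注(示意,非 GATK 源码):上面的 computeReadLikelihoods 在逐样本算完 PairHMM 似然后,
    // 用 log10globalReadMismappingRate 作为似然差的上限做归一化。按构造函数注释中的例子:
    // 该参数为 -3、某条 read 的最优 haplotype 似然为 -10 时,参考 haplotype 即使从 PairHMM 得到 -100,
    // 也会被限制为 -10 + (-3) = -13,从而限定一条可能被错误比对的 read 对总体似然的最大影响。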
    /**
     * Creates a new GATKRead with the source read's header, read group and mate
     * information, but with the following fields set to user-supplied values:
     *  - Read Bases
     *  - Base Qualities
     *  - Base Insertion Qualities
     *  - Base Deletion Qualities
     *
     * Cigar string is empty (not-null)
     *
     * Use this method if you want to create a new GATKRead based on
     * another GATKRead, but with modified bases and qualities
     *
     * @param read a read to copy the header from
     * @param readBases an array containing the new bases you wish to use in place of the originals
     * @param baseQualities an array containing the new base qualities you wish to use in place of the originals
     * @param baseInsertionQualities an array containing the new base insertion qualities
     * @param baseDeletionQualities an array containing the new base deletion qualities
     * @return a read with modified bases and qualities, safe for the GATK
     */
    private static GATKRead createQualityModifiedRead(final GATKRead read,
                                                      final byte[] readBases,
                                                      final byte[] baseQualities,
                                                      final byte[] baseInsertionQualities,
                                                      final byte[] baseDeletionQualities) {
        Utils.validateArg( baseQualities.length == readBases.length && baseInsertionQualities.length == readBases.length && baseDeletionQualities.length == readBases.length,
                () -> String.format("Read bases and read quality arrays aren't the same size: Bases: %d vs Base Q's: %d vs Insert Q's: %d vs Delete Q's: %d.",
                        readBases.length, baseQualities.length, baseInsertionQualities.length, baseDeletionQualities.length));

        final GATKRead processedRead = ReadUtils.emptyRead(read);
        processedRead.setBases(readBases);
        processedRead.setBaseQualities(baseQualities);
        ReadUtils.setInsertionBaseQualities(processedRead, baseInsertionQualities);
        ReadUtils.setDeletionBaseQualities(processedRead, baseDeletionQualities);
        return processedRead;
    }

    /**
     * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
     *
     * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples
     * against the set of haplotypes
     *
     * @param haplotypes a non-null list of haplotypes
     * @param perSampleReadList a mapping from sample -> reads
     */
    private void initializePairHMM(final List<Haplotype> haplotypes, final Map<String, List<GATKRead>> perSampleReadList) {
        final int readMaxLength = perSampleReadList.entrySet().stream().flatMap(e -> e.getValue().stream()).mapToInt(GATKRead::getLength).max().orElse(0);
        final int haplotypeMaxLength = haplotypes.stream().mapToInt(h -> h.getBases().length).max().orElse(0);

        // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
        pairHMM.initialize(haplotypes, perSampleReadList, readMaxLength, haplotypeMaxLength);
    }

    private void computeReadLikelihoods(final LikelihoodMatrix<GATKRead, Haplotype> likelihoods) {
        // Modify the read qualities by applying the PCR error model and capping the minimum base, insertion, deletion qualities
        final List<GATKRead> processedReads = modifyReadQualities(likelihoods.evidence());

        for (int counter = 0; counter < processedReads.size(); counter++) {
            GATKRead read = processedReads.get(counter);
            if (HaplotypeCallerGenotypingDebugger.isEnabled()) {
                HaplotypeCallerGenotypingDebugger.println("read " + counter + ": " + read.getName() + " cigar: " + read.getCigar() + " mapQ: " + read.getMappingQuality() + " loc: [" + read.getStart() + "-" + read.getEnd() + "] unclippedloc: [" + read.getUnclippedStart() + "-" + read.getUnclippedEnd() + "]");
                HaplotypeCallerGenotypingDebugger.println(Arrays.toString(read.getBaseQualitiesNoCopy()));
            }
        }

        // Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype
        pairHMM.computeLog10Likelihoods(likelihoods, processedReads, inputScoreImputator);
    }
    /**
     * Pre-processing of the reads to be evaluated at the current location from the current sample.
     * We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read.
     * Modified copies of reads are packed into a new list, while original reads are retained for downstream use
     *
     * @param reads The original list of unmodified reads
     * @return processedReads. A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding
     */
    private List<GATKRead> modifyReadQualities(final List<GATKRead> reads) {
        final List<GATKRead> result = new ArrayList<>(reads.size());

        for (final GATKRead read : reads) {
            final GATKRead maybeUnclipped = modifySoftclippedBases ? read : ReadClipper.hardClipSoftClippedBases(read);    //Clip the bases here to remove hap
            final byte[] readBases = maybeUnclipped.getBases();

            // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
            // Using clone here is justified - it's an array of primitives.
            final byte[] readQuals = maybeUnclipped.getBaseQualities().clone();
            final byte[] readInsQuals = ReadUtils.getBaseInsertionQualities(maybeUnclipped).clone();
            final byte[] readDelQuals = ReadUtils.getBaseDeletionQualities(maybeUnclipped).clone();

            applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
            capMinimumReadQualities(maybeUnclipped, readQuals, readInsQuals, readDelQuals, baseQualityScoreThreshold, disableCapReadQualitiesToMapQ);

            // Store the actual qualities
            read.setTransientAttribute(HMM_BASE_QUALITIES_TAG, readQuals);
            // Create a new copy of the read and set its base qualities to the modified versions.
            result.add(createQualityModifiedRead(maybeUnclipped, readBases, readQuals, readInsQuals, readDelQuals));
        }
        return result;
    }

    private static void capMinimumReadQualities(final GATKRead read, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, final byte baseQualityScoreThreshold, final boolean disableCapReadQualitiesToMapQ) {
        for( int i = 0; i < readQuals.length; i++ ) {
            if (!disableCapReadQualitiesToMapQ) {
                readQuals[i] = (byte) Math.min(0xff & readQuals[i], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
            }
            readQuals[i] = setToFixedValueIfTooLow( readQuals[i], baseQualityScoreThreshold, QualityUtils.MIN_USABLE_Q_SCORE );
            readInsQuals[i] = setToFixedValueIfTooLow( readInsQuals[i], QualityUtils.MIN_USABLE_Q_SCORE, QualityUtils.MIN_USABLE_Q_SCORE );
            readDelQuals[i] = setToFixedValueIfTooLow( readDelQuals[i], QualityUtils.MIN_USABLE_Q_SCORE, QualityUtils.MIN_USABLE_Q_SCORE );
        }
    }

    private static byte setToFixedValueIfTooLow(final byte currentVal, final byte minQual, final byte fixedQual){
        return currentVal < minQual ? fixedQual : currentVal;
    }

    private static Map<GATKRead, byte[]> buildGapContinuationPenalties(final List<GATKRead> reads, final byte gapPenalty) {
        final Map<GATKRead, byte[]> result = new HashMap<>(reads.size());
        reads.stream().forEach(read -> result.put(read, Utils.dupBytes(gapPenalty, read.getLength())));
        return result;
    }

    /* --------------------------------------------------------------------------------
     *
     * Experimental attempts at PCR error rate modeling
     *
     * -------------------------------------------------------------------------------- */

    private byte[] pcrIndelErrorModelCache;
    private PairHMMInputScoreImputator inputScoreImputator;
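    // 编者注(示意,非 GATK 源码):下面的 initializePCRErrorModel / getErrorModelAdjustedQual
    // 实现了上文 PCRErrorModel 枚举所描述的 PCR indel 误差模型。调整后的质量值为
    //     max(MIN_ADJUSTED_QSCORE, round(INITIAL_QSCORE - exp(repeatLength / (rateFactor * PI)) + 1))
    // 按该公式粗略代入:repeatLength = 10 时,HOSTILE(rateFactor = 1.0)约为 Q17,
    // AGGRESSIVE(2.0)约为 Q36,CONSERVATIVE(3.0)约为 Q38;repeatLength = 15 时 HOSTILE 已被截到下限 Q10。
    // rateFactor 越小,串联重复区的 indel 质量被压得越低,模型越“激进”。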
    private void initializePCRErrorModel() {

        inputScoreImputator = dragstrParams == null
                ? StandardPairHMMInputScoreImputator.newInstance(constantGCP)
                : DragstrPairHMMInputScoreImputator.of(dragstrParams);

        if ( !pcrErrorModel.hasRateFactor() ) {
            return;
        }

        pcrIndelErrorModelCache = new byte[ReadLikelihoodCalculationEngine.MAX_REPEAT_LENGTH + 1];

        final double rateFactor = pcrErrorModel.getRateFactor();

        for( int i = 0; i <= ReadLikelihoodCalculationEngine.MAX_REPEAT_LENGTH; i++ ) {
            pcrIndelErrorModelCache[i] = getErrorModelAdjustedQual(i, rateFactor);
        }
    }

    static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) {
        return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound(INITIAL_QSCORE - Math.exp(repeatLength / (rateFactor * Math.PI)) + 1.0));
    }

    @VisibleForTesting
    void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) {
        if ( pcrErrorModel == PCRErrorModel.NONE ) {
            return;
        }

        for ( int i = 1; i < readBases.length; i++ ) {
            final int repeatLength = ReadLikelihoodCalculationEngine.findTandemRepeatUnits(readBases, i-1).getRight();
            readInsQuals[i-1] = (byte) Math.min(0xff & readInsQuals[i - 1], 0xff & pcrIndelErrorModelCache[repeatLength]);
            readDelQuals[i-1] = (byte) Math.min(0xff & readDelQuals[i - 1], 0xff & pcrIndelErrorModelCache[repeatLength]);
        }
    }
}
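下面给出一段编者整理的用法示意(非 GATK 源码),展示如何构造 PairHMMLikelihoodCalculationEngine 并调用 computeReadLikelihoods。其中 header、samples、perSampleReads、haplotypes 等对象假设已由上游装配流程(例如 HaplotypeCaller 的局部重组装结果)准备好,类名 PairHmmEngineUsageSketch、具体参数取值均为演示用的假设,并非官方推荐配置。
import htsjdk.samtools.SAMFileHeader;

import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMM;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.List;
import java.util.Map;

/** 编者示意(非 GATK 源码):PairHMMLikelihoodCalculationEngine 的一种典型调用方式草图。 */
public final class PairHmmEngineUsageSketch {

    /** 假设 header、samples、perSampleReads、haplotypes 已由调用方准备好。 */
    public static AlleleLikelihoods<GATKRead, Haplotype> run(final SAMFileHeader header,
                                                             final SampleList samples,
                                                             final Map<String, List<GATKRead>> perSampleReads,
                                                             final List<Haplotype> haplotypes) {
        // 构造引擎:constantGCP = 10;不启用 DRAGstr 模型(dragstrParams = null);
        // 使用纯 Java 的 LOGLESS_CACHING 实现,因此 PairHMMNativeArguments 传 null;
        // log10globalReadMismappingRate = -3(错误比对概率约 1/1000);PCR 误差模型取 CONSERVATIVE。
        final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine(
                (byte) 10, null, null, PairHMM.Implementation.LOGLESS_CACHING,
                -3.0, PairHMMLikelihoodCalculationEngine.PCRErrorModel.CONSERVATIVE);
        try {
            // 计算每条 read 在每个候选 haplotype 下的 log10 似然;
            // filterPoorly = true 表示过滤建模极差(poorly modeled)的 reads
            return engine.computeReadLikelihoods(haplotypes, header, samples, perSampleReads, true);
        } finally {
            engine.close();   // 释放 PairHMM 占用的资源
        }
    }
}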