% Technical report 2022-005, Department of Information Technology, Uppsala University.
@techreport{it:2022-005,
  author      = {Clouard, Camille and Nettelblad, Carl},
  title       = {Consistency Study of a Reconstructed Genotype Probability Distribution
                 via Clustered Bootstrapping in {NORB} Pooling Blocks},
  institution = {Department of Information Technology, Uppsala University},
  department  = {Division of Scientific Computing},
  number      = {2022-005},
  year        = {2022},
  month       = jun,
  abstract    = {For applications with biallelic genetic markers, group testing
                 techniques, synonymous to pooling techniques, are usually applied for
                 decreasing the cost of large-scale testing as e.g. when detecting
                 carriers of rare genetic variants. In some configurations, the results
                 of the grouped tests cannot be decoded and the pooled items are missing.
                 Inference of these missing items can be performed with specific
                 statistical methods that are for example related to the
                 Expectation-Maximization algorithm. Pooling has also been applied for
                 determining the genotype of markers in large populations. The
                 particularity of full genotype data for diploid organisms in the context
                 of group testing are the ternary outcomes (two homozygous genotypes and
                 one heterozygous), as well as the distribution of these three outcomes
                 in a population, which is often ruled by the Hardy-Weinberg Equilibrium
                 and depends on the allele frequency in such situation. When using a
                 nonoverlapping repeated block pooling design, the missing items are only
                 observed in particular arrangements. Overall, a data set of pooled
                 genotypes can be described as an inference problem in Missing Not At
                 Random data with nonmonotone missingness patterns. This study presents a
                 preliminary investigation of the consistency of various iterative
                 methods estimating the most likely genotype probabilities of the missing
                 items in pooled data. We use the Kullback-Leibler divergence and the L2
                 distance between the genotype distribution computed from our estimates
                 and a simulated empirical distribution as a measure of the
                 distributional consistency. },
}