{PEDANT Version 1.0 (26 January 2007) Paul Johnson Robertson Centre for Biostatistics Boyd Orr Building University of Glasgow University Avenue Glasgow G12 8QQ UK paulj@stats.gla.ac.uk http://www.stats.gla.ac.uk/~paulj/index.html PEDANT is program for estimating maximum likelihood allelic dropout and false allele error rates from duplicate genotypes in the absence of reference genotypes. It is written in Delphi (which is based on Turbo Pascal). The main function of PEDANT is to estimate error rates with confidence intervals from real genotypic data. In addition, PEDANT can perform simulations, allowing you to decide how many samples you need to replicate to get the level of accuracy you want in your error rate estimates. If you download PEDANT, please let me know by email at p.johnson@bio.gla.ac.uk so that I can keep you informed of bug fixes and updates. Also let me know any comments, suggestions, bugs, etc. IMPORTANT NOTE: This file contains the bulk of what's in Pedant, including all the equations and calculation machinery. To compile and run Pedant within Delphi you'll need two other files that, among other functions, specify properties of the Windows interface. Please email me if you would like these files.} unit pedant250805; interface uses Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms, Dialogs, Menus, StdCtrls, Math, TeEngine, Series, ExtCtrls, TeeProcs, Chart, ComCtrls; const see_points=1; {if =0 then individual estimates will not be plotted on Multi Sim plot and method comparison plot} variable_sample_quality=0; {if =1 then the error rates will be varied across the data set during simulation} see_test_stats=0; {if =1 then bias & variance of ML estimates will be output, as will the % times the pop parameters fall within 95% CI of estimates} type TtrialForm1 = class(TForm) MainMenu1: TMainMenu; File1: TMenuItem; simbutton: TButton; multisimbutton: TButton; OpenDialog1: TOpenDialog; Open1: TMenuItem; Exit1: TMenuItem; getconfidence: TButton; PageControl1: TPageControl; TabSheet1: TTabSheet; TabSheet2: TTabSheet; Chart1: TChart; Series1: TPointSeries; Series2: TPointSeries; TabSheet3: TTabSheet; Chart2: TChart; TabSheet4: TTabSheet; Output1: TMemo; Summary: TMemo; Series3: TPointSeries; TabSheet6: TTabSheet; Chart3: TChart; Series4: TPointSeries; Series5: TPointSeries; Series6: TPointSeries; Series7: TPointSeries; Series8: TPointSeries; Series9: TLineSeries; Series10: TLineSeries; Series11: TLineSeries; Series12: TLineSeries; Series13: TPointSeries; Series14: TPointSeries; Series15: TPointSeries; Series16: TPointSeries; Series17: TPointSeries; Series18: TPointSeries; Series19: TPointSeries; Series20: TPointSeries; Series21: TLineSeries; Series22: TLineSeries; Series23: TPointSeries; TabSheet5: TTabSheet; Chart4: TChart; Chart5: TChart; Series24: TLineSeries; Series27: TLineSeries; Series28: TPointSeries; Series29: TPointSeries; Label1: TLabel; Series30: TPointSeries; Series31: TPointSeries; Series32: TLineSeries; Series33: TLineSeries; Series34: TPointSeries; Series25: TLineSeries; Series26: TLineSeries; TabSheet7: TTabSheet; Chart6: TChart; Series35: TPointSeries; Series36: TPointSeries; Series37: TPointSeries; Series38: TLineSeries; Series39: TLineSeries; Series40: TPointSeries; Series41: TPointSeries; Series42: TPointSeries; Series43: TPointSeries; Series44: TLineSeries; Series45: TLineSeries; Series46: TPointSeries; Series47: TPointSeries; OpenHfile1: TMenuItem; TabSheet8: TTabSheet; HeOutput: TMemo; Label2: TLabel; Series48: TPointSeries; Series49: TPointSeries; procedure simdata; {simulates data} procedure simbuttonClick(Sender: TObject); {run simdata} procedure getlikelihood; {calculate likelihood of any E1,E2} procedure maxlikelihood; {calculates maximum likelihood E1,E2 for a particular sim or real data set} procedure multisimbuttonClick(Sender: TObject); {run multisim} procedure multisim; {run mutiple simulations and plot mean and std dev error rates} procedure read_data; {input repeat genotype data from file} procedure get_data; {estimate error rates from repeat genotype data} procedure Exit1Click(Sender: TObject); {close program} procedure Open1Click(Sender: TObject); {open repeat genotype data file} procedure getconfidenceClick(Sender: TObject); {run confidence limits} procedure confidence_limits; {plot confidence limits for error estimates} procedure get_parameters; {input parameters for simulations} procedure OpenHfile1Click(Sender: TObject); {open genotype data file for He estimation} procedure read_He_data; {input genotype data from file for He estimation} procedure analyse_He_data; {estimate He and std dev from genotype data} private { Private declarations } public { Public declarations } end; integer_vector=array[1..100] of integer; real_vector=array[1..100] of real; extended_vector=array[1..100] of extended; var trialForm1: TtrialForm1; data_cat, sim_cat, sample_error: integer_vector; exp_freq, exp_cat: extended_vector; d0, d1, o0, o1, o2: Real; i, tot_sim_obs, tot_data_obs, noof_samples, noof_whole_sample: integer; E1,E2: real; exp_freq_obs: extended; drop_per_het, false_per_pcr: real; sim_or_real: integer; infile1,infile2:text; He, sim_He, true_He:real; error1, error2: real; testarray:array[1..2000,1..101] of integer; He_array:array[1..4000,1..50] of integer; data_He:array[1..25] of real; data_He_stddev: real_vector; sum_error1, sum_error2, sum_ref_error1, sum_ref_error2, sum_per_het_error1, sum_per_pcr_error2, sum_sqrs_error1, sum_sqrs_error2, sum_sqrs_ref_error1, sum_sqrs_ref_error2, sum_sqrs_per_het_error1, sum_sqrs_per_pcr_error2: real; est_drop_per_het, est_false_per_pcr, true_drop_per_het, true_false_per_pcr: real; see_genotypes, see_categories, see_cat_summary, see_exp_freq, see_exp_num, see_walk, see_walk_graph, see_sim_cat, see_data_cat, see_ml_vs_ref, see_ml_vs_ref_graph, fast_settings: integer; sample_error1_per_allele, sample_error2_per_allele, mean_sample_error1_per_allele, mean_sample_error2_per_allele, sum_sample_error1_per_allele, sum_sample_error2_per_allele, sum_sqrs_sample_error1_per_allele, sum_sqrs_sample_error2_per_allele: real; drop_dom, ref_data_fraction, Fis: real; noof_steps: integer; sample_drop_per_het, sample_false_per_pcr: real; simple_error1, simple_error2, simple_error1_per_het, simple_error1_per_pcr, simple_error2_per_pcr, sum_simple_error1, sum_simple_error2, sum_sample_error1, sum_sample_error2, sum_sqrs_simple_error1, sum_sqrs_simple_error2, sum_sqrs_sample_error1, sum_sqrs_sample_error2: real; sum_pop_dev_error1, sum_pop_dev_error2, sum_sample_dev_error1, sum_sample_dev_error2, sum_sqrs_pop_dev_error1, sum_sqrs_pop_dev_error2, sum_sqrs_sample_dev_error1, sum_sqrs_sample_dev_error2: real; single_locus, which_locus, noof_loci: integer; noof_He_loci, using_He_limits: integer; sim_noof_He_samples: integer; sim_He_stddev: real; sample_inside_90CI, pop_inside_90CI, sample_inside_95CI, pop_inside_95CI: integer; ml_wins_E1, ref_wins_E1, draw_E1, ml_wins_E2, ref_wins_E2, draw_E2: integer; sum_sqrs_ref_data_drop, sum_sqrs_ML_drop, sum_sqrs_ref_data_false, sum_sqrs_ML_false: real; implementation {$R *.dfm} function Fact(n: variant): extended; {Calculates factorials exactly from 0-12 and by Stirling's approximation above 12} var f: LongInt; i: Integer; begin if n > 12 then Result := SQRT(2*pi*n)*power(n,n)*EXP(-1*n+(1/(12*n))) {Stirling's approximation of n!} else begin f := 1; for i := 2 to n do f := f * i; Result := f; end; end; function log_likelihood(E1, E2: real): extended; {Calculates likelihood of any E1,E2 given a data set} var tot_obs: integer; cat: integer_vector; begin if sim_or_real = 0 then cat := sim_cat else cat := data_cat; tot_obs := cat[1]+cat[2]+cat[3]+cat[4]+cat[5]+cat[6]+cat[7]; d0 := (1-E1)*(1-E1); {P(no dropouts in a single genotype)} d1 := E1*(1-E1); {P(1 dropout in a single genotype)} o0 := (1-E2)*(1-E2); {P(no false alleles in a single genotype)} o1 := E2*(1-E2); {P(1 false allele in a single genotype)} o2 := E2*E2; {P(2 false alleles in a single genotype)} {exp_freq_obs is the sum of the expected frequencies of categories 1-7. It is used to adjust the 7 expected frequencies to sum to one} exp_freq_obs := ((1-He)* (d0*d0*o0*o0+d0*d1*o0*o0+d0*d1*o0*o0+d0*d1*o0*o0+d0*d1*o0*o0+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d1*d1*o0*o0+d1*d1*o0*o0+d1*d1*o0*o0+d1*d1*o0*o0+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1)+He* (d1*d1*o0*o0+d1*d1*o0*o1+d1*d1*o1*o0+d1*d1*o1*o1+d1*d1*o0*o0+d1*d1*o0*o1+d1*d1*o1*o0+d1*d1*o1*o1)+He* (d0*d0*o0*o0)+ (1-He)* (d0*d0*o0*o1+d0*d0*o0*o1+d0*d0*o0*o1+d0*d0*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1)+He* (d1*d0*o0*o0+d1*d0*o1*o0+d1*d0*o0*o1+d1*d0*o1*o1+d0*d1*o0*o0+d0*d1*o0*o1+d0*d1*o0*o0+d0*d1*o0*o1+d0*d1*o1*o0+d0*d1*o1*o1+d1*d0*o0*o0+d1*d0*o1*o0+d1*d0*o0*o1+d1*d0*o1*o1+d0*d1*o1*o0+d0*d1*o1*o1)+ (1-He)* (d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o0*o2+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1)+He* (d1*d1*o0*o0+d1*d1*o0*o1+d1*d1*o1*o0+d1*d1*o1*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o2+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o0*o2+d1*d1*o0*o0+d1*d1*o0*o1+d1*d1*o1*o0+d1*d1*o1*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o2+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o0*o2+d1*d1*o1*o0+d1*d1*o1*o0+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o2*o0+d1*d1*o1*o0+d1*d1*o1*o0+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o2*o0+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o2*o0+d1*d1*o2*o0)+ (1-He)* (d0*d0*o1*o1+d0*d0*o1*o1+d0*d0*o1*o1+d0*d0*o1*o1)+He* (d0*d0*o0*o1+d0*d0*o0*o1+d0*d0*o1*o0+d0*d0*o1*o1+d0*d0*o1*o0+d0*d0*o1*o1)+ (1-He)* (d0*d0*o0*o2+d0*d0*o0*o2+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1)+He* (d1*d0*o0*o1+d1*d0*o1*o1+d1*d0*o0*o2+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o1*o0+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d1*d0*o0*o1+d1*d0*o1*o1+d1*d0*o0*o2+d0*d1*o1*o0+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d1*d0*o1*o0+d1*d0*o1*o0+d1*d0*o2*o0+d1*d0*o1*o1+d1*d0*o1*o1+d1*d0*o1*o1+d1*d0*o1*o1+d0*d1*o2*o0+d0*d1*o2*o0+d1*d0*o2*o0)+He* (d0*d0*o0*o2+d0*d0*o1*o1+d0*d0*o1*o1+d0*d0*o2*o0)); {exp_freq_obs := sqr(1-sqr(E1)); } {P(AA.AA-type repeat genotype)} exp_freq[1] := ((1-He)* (d0*d0*o0*o0+d0*d1*o0*o0+d0*d1*o0*o0+d0*d1*o0*o0+d0*d1*o0*o0+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d1*d1*o0*o0+d1*d1*o0*o0+d1*d1*o0*o0+d1*d1*o0*o0+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1)+He* (d1*d1*o0*o0+d1*d1*o0*o1+d1*d1*o1*o0+d1*d1*o1*o1+d1*d1*o0*o0+d1*d1*o0*o1+d1*d1*o1*o0+d1*d1*o1*o1)) /exp_freq_obs; {P(AB.AB-type repeat genotype)} exp_freq[2] := (He* (d0*d0*o0*o0)) /exp_freq_obs; {P(AA.AB-type repeat genotype)} exp_freq[3] := ((1-He)* (d0*d0*o0*o1+d0*d0*o0*o1+d0*d0*o0*o1+d0*d0*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1)+He* (d1*d0*o0*o0+d1*d0*o1*o0+d1*d0*o0*o1+d1*d0*o1*o1+d0*d1*o0*o0+d0*d1*o0*o1+d0*d1*o0*o0+d0*d1*o0*o1+d0*d1*o1*o0+d0*d1*o1*o1+d1*d0*o0*o0+d1*d0*o1*o0+d1*d0*o0*o1+d1*d0*o1*o1+d0*d1*o1*o0+d0*d1*o1*o1)) /exp_freq_obs; {P(AA.BB-type repeat genotype)} exp_freq[4] := ((1-He)* (d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o0*o2+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o0*o2+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1)+He* (d1*d1*o0*o0+d1*d1*o0*o1+d1*d1*o1*o0+d1*d1*o1*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o2+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o0*o2+d1*d1*o0*o0+d1*d1*o0*o1+d1*d1*o1*o0+d1*d1*o1*o1+d1*d1*o0*o1+d1*d1*o0*o1+d1*d1*o0*o2+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o0*o2+d1*d1*o1*o0+d1*d1*o1*o0+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o2*o0+d1*d1*o1*o0+d1*d1*o1*o0+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o2*o0+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o1*o1+d1*d1*o2*o0+d1*d1*o2*o0)) /exp_freq_obs; {P(AB.AC-type repeat genotype)} exp_freq[5] := ((1-He)* (d0*d0*o1*o1+d0*d0*o1*o1+d0*d0*o1*o1+d0*d0*o1*o1)+He* (d0*d0*o0*o1+d0*d0*o0*o1+d0*d0*o1*o0+d0*d0*o1*o1+d0*d0*o1*o0+d0*d0*o1*o1)) /exp_freq_obs; {P(AB.CC-type repeat genotype)} exp_freq[6] := ((1-He)* (d0*d0*o0*o2+d0*d0*o0*o2+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1)+He* (d1*d0*o0*o1+d1*d0*o1*o1+d1*d0*o0*o2+d0*d1*o0*o1+d0*d1*o0*o1+d0*d1*o0*o2+d0*d1*o0*o2+d0*d1*o1*o0+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d1*d0*o0*o1+d1*d0*o1*o1+d1*d0*o0*o2+d0*d1*o1*o0+d0*d1*o1*o1+d0*d1*o1*o1+d0*d1*o1*o1+d1*d0*o1*o0+d1*d0*o1*o0+d1*d0*o2*o0+d1*d0*o1*o1+d1*d0*o1*o1+d1*d0*o1*o1+d1*d0*o1*o1+d0*d1*o2*o0+d0*d1*o2*o0+d1*d0*o2*o0)) /exp_freq_obs; {P(AB.CD-type repeat genotype)} exp_freq[7] := (He* (d0*d0*o0*o2+d0*d0*o1*o1+d0*d0*o1*o1+d0*d0*o2*o0){+power(E2,4)+4*(1-E2)*power(E2,3)}) /exp_freq_obs; {the multinomial equation which gives the log likelihood of the observed frequencies (cat[1...7]) given the expected frequencies} result := ln((fact(tot_obs) /(fact(cat[1])*fact(cat[2])*fact(cat[3]) *fact(cat[4])*fact(cat[5])*fact(cat[6])*fact(cat[7]))) *power(exp_freq[1],cat[1])*power(exp_freq[2],cat[2]) *power(exp_freq[3],cat[3])*power(exp_freq[4],cat[4]) *power(exp_freq[5],cat[5])*power(exp_freq[6],cat[6]) *power(exp_freq[7],cat[7])); end; procedure TtrialForm1.simbuttonClick(Sender: TObject); begin label1.caption := ('Using simulated data'); output1.Lines.Add(''); output1.Lines.Add('Starting simulation...'); get_parameters; {sim_allele_freq;} fast_settings := 0; see_genotypes := 1; see_categories := 0; {set this to zero to hide categories during simulation} see_cat_summary := 1; see_exp_freq := 1; see_exp_num := 1; see_walk := 1; see_walk_graph := 1; see_sim_cat := 1; see_data_cat := 0; see_ml_vs_ref := 1; see_ml_vs_ref_graph := 0; ref_data_fraction := 1; simdata; getlikelihood; if see_walk_graph = 1 then noof_steps := strtoint(inputbox('Enter number of search steps', 'At least 1000', inttostr(noof_steps))); maxlikelihood; if see_walk_graph = 1 then begin series1.clear; series2.clear; series48.clear; series49.clear; series28.clear; series29.clear; confidence_limits; output1.Lines.Add('Confidence limits calculated. Click "CI" tab.'); end; end; procedure TtrialForm1.multisimbuttonClick(Sender: TObject); begin label1.caption := ('Using simulated data'); multisim; end; procedure TtrialForm1.get_parameters; {asks for input of parameters for simulation} begin He := strtofloat(inputbox('Enter He', 'Between 0 and 1 (not 0 or 1)', floattostr(He))); Fis := strtofloat(inputbox('Enter Fis', 'From -1 to 1', floattostr(Fis))); E1 := strtofloat(inputbox('Enter dropout rate per allele (E1)', 'From 0 to 1', floattostr(E1))); {dropout} E2 := strtofloat(inputbox('Enter false allele rate per allele (E2)', 'From 0 to 1', floattostr(E2))); {false allele} drop_dom := strtofloat(inputbox('What fraction of dropouts are dominant?', 'From 0 to 1', floattostr(drop_dom))); {dropout} NOOF_SAMPLES := strtoint(inputbox('Enter sample size', 'Large samples (>1000) can cause crashes', inttostr(NOOF_SAMPLES))); sim_noof_He_samples := strtoint(inputbox('He estimate sample size', 'Any integer', inttostr(sim_noof_He_samples))); sim_He_stddev := SQRT(1/(20*sim_noof_He_samples))*(1-He)*(1+3*He); end; procedure TtrialForm1.multisim; var m, num_sim: integer; DateTime0, datetime1: TDateTime; mean_error1, mean_error2, mean_per_het_error1, mean_per_pcr_error2, mean_ref_error1, mean_ref_error2, mean_simple_error1, mean_simple_error2, mean_sample_error1, mean_sample_error2, stdv_error1, stdv_error2, stdv_per_het_error1, stdv_per_pcr_error2, stdv_ref_error1, stdv_ref_error2, stdv_simple_error1, stdv_simple_error2, stdv_sample_error1, stdv_sample_error2, stdv_sample_error1_per_allele, stdv_sample_error2_per_allele, ref_data_num: real; mean_pop_dev_error1, mean_pop_dev_error2, mean_sample_dev_error1, mean_sample_dev_error2: real; begin see_genotypes := 0; see_categories := 0; see_cat_summary := 0; see_exp_freq := 0; see_exp_num := 0; see_walk := 0; see_walk_graph := 0; see_sim_cat := 0; see_data_cat := 0; see_ml_vs_ref := 1; see_ml_vs_ref_graph := 1; fast_settings := 1; get_parameters; num_sim := 1000; num_sim := strtoint(inputbox('How many simulated data sets?', '', inttostr(num_sim))); ref_data_fraction := 1; {ref_data_fraction := strtofloat(inputbox('What fraction by ref data method?', 'From 0 to 1', floattostr(ref_data_fraction))); } output1.Lines.Add('This procedure repeats the simulation and error estimation process ' + inttostr(num_sim) + ' times.'); output1.Lines.Add('Preset parameters: He = ' + floattostr(He) + ' Sample size = ' + inttostr(noof_samples) + ' True E1 = ' + floattostr(E1) + ' True E2 = ' + floattostr(E2)); noof_steps := 1000; if (E1<0.01) or (E2<0.01) then noof_steps := 10000; sum_sample_error1_per_allele :=0; sum_sample_error2_per_allele :=0; sum_sqrs_sample_error1_per_allele :=0; sum_sqrs_sample_error2_per_allele :=0; sum_error1 := 0; sum_error2 := 0; sum_per_het_error1 := 0; sum_per_pcr_error2 := 0; sum_ref_error1 := 0; sum_ref_error2 := 0; sum_simple_error1 := 0; sum_simple_error2 := 0; sum_sample_error1 := 0; sum_sample_error2 := 0; sum_sqrs_error1 := 0; sum_sqrs_error2 := 0; sum_sqrs_per_het_error1 := 0; sum_sqrs_per_pcr_error2 := 0; sum_sqrs_ref_error1 := 0; sum_sqrs_ref_error2 := 0; sum_sqrs_simple_error1 := 0; sum_sqrs_simple_error2 := 0; sum_sqrs_sample_error1 := 0; sum_sqrs_sample_error2 := 0; sum_pop_dev_error1 :=0; sum_pop_dev_error2 :=0; sum_sample_dev_error1 :=0; sum_sample_dev_error2 :=0; sum_sqrs_pop_dev_error1 := 0; sum_sqrs_pop_dev_error2 := 0; sum_sqrs_sample_dev_error1 := 0; sum_sqrs_sample_dev_error2 := 0; m := 0; datetime0 := time; series4.clear; series5.clear; series6.clear; series7.clear; series8.clear; series9.clear; series10.clear; series11.clear; series12.clear; series13.clear; series14.clear; series15.clear; series16.clear; series17.clear; series18.clear; series19.clear; series20.clear; series21.clear; series22.clear; series23.clear; series30.clear; series31.clear; series32.clear; series33.clear; series34.clear; series35.clear; series36.clear; series37.clear; series38.clear; series39.clear; series40.clear; series41.clear; series42.clear; series43.clear; series44.clear; series45.clear; series46.clear; series47.clear; sample_inside_90CI := 0; pop_inside_90CI := 0; sample_inside_95CI := 0; pop_inside_95CI := 0; ml_wins_E1 := 0; ref_wins_E1 := 0; draw_E1 := 0; ml_wins_E2 := 0; ref_wins_E2 := 0; draw_E2 := 0; sum_sqrs_ref_data_drop := 0; sum_sqrs_ML_drop := 0; sum_sqrs_ref_data_false := 0; sum_sqrs_ML_false := 0; repeat inc(m); output1.Lines.Add(inttostr(m) + '/' + inttostr(num_sim)); simdata; maxlikelihood; until m = num_sim; Series6.AddXY(true_drop_per_het, true_false_per_pcr,'',clblack); series36.addXY(E1,E2,'',clblack); mean_error1 := sum_error1/num_sim; mean_error2 := sum_error2/num_sim; mean_sample_error1_per_allele := sum_sample_error1_per_allele/num_sim; mean_sample_error2_per_allele := sum_sample_error2_per_allele/num_sim; mean_per_het_error1 := sum_per_het_error1/num_sim; mean_per_pcr_error2 := sum_per_pcr_error2/num_sim; mean_ref_error1 := sum_ref_error1/num_sim; mean_ref_error2 := sum_ref_error2/num_sim; mean_simple_error1 := sum_simple_error1/num_sim; mean_simple_error2 := sum_simple_error2/num_sim; mean_sample_error1 := sum_sample_error1/num_sim; mean_sample_error2 := sum_sample_error2/num_sim; mean_pop_dev_error1 := sum_pop_dev_error1/num_sim; mean_pop_dev_error2 := sum_pop_dev_error2/num_sim; mean_sample_dev_error1 := sum_sample_dev_error1/num_sim; mean_sample_dev_error2 := sum_sample_dev_error2/num_sim; if see_test_stats=1 then begin summary.lines.add(''); summary.lines.add('Mean error between estimated E1 and sample E1 = '+floattostr(mean_sample_dev_error1)); if mean_sample_error1_per_allele>0 then summary.lines.add('Relative bias in E1 = '+floattostr(mean_sample_dev_error1/mean_sample_error1_per_allele)); summary.lines.add('Mean error between estimated E2 and sample E2 = '+floattostr(mean_sample_dev_error2)); if mean_sample_error2_per_allele>0 then summary.lines.add('Relative bias in E2 = '+floattostr(mean_sample_dev_error2/mean_sample_error2_per_allele)); summary.lines.add('Root mean squared error between estimated E1 and population E1 = '+floattostr(sqrt(sum_sqrs_pop_dev_error1/num_sim))); summary.lines.add('Root mean squared error between estimated E2 and population E2 = '+floattostr(sqrt(sum_sqrs_pop_dev_error2/num_sim))); summary.lines.add('Root mean squared error between estimated E1 and sample E1 = '+floattostr(sqrt(sum_sqrs_sample_dev_error1/num_sim))); summary.lines.add('Root mean squared error between estimated E2 and sample E2 = '+floattostr(sqrt(sum_sqrs_sample_dev_error2/num_sim))); summary.lines.add('Sample MSE divided by population MSE (intrinsic error) for E1 = '+floattostr(sum_sqrs_sample_dev_error1/sum_sqrs_pop_dev_error1)); summary.lines.add('Sample MSE divided by population MSE (intrinsic error) for E2 = '+floattostr(sum_sqrs_sample_dev_error2/sum_sqrs_pop_dev_error2)); summary.lines.add(''); summary.lines.add(floattostr(100*sample_inside_95CI/num_sim)+'% of sample error rates inside 95% CI'); summary.lines.add(floattostr(100*pop_inside_95CI/num_sim)+'% of population error rates inside 95% CI'); summary.lines.add(floattostr(100*sample_inside_90CI/num_sim)+'% of sample error rates inside 90% CI'); summary.lines.add(floattostr(100*pop_inside_90CI/num_sim)+'% of population error rates inside 90% CI'); summary.lines.add(''); summary.lines.add('E1: ML wins = '+floattostr(100*ml_wins_E1/num_sim)+'%, losses = '+floattostr(100*ref_wins_E1/num_sim)+'%, draws = '+floattostr(100*draw_E1/num_sim)+'%.'); summary.lines.add('E2: ML wins = '+floattostr(100*ml_wins_E2/num_sim)+'%, losses = '+floattostr(100*ref_wins_E2/num_sim)+'%, draws = '+floattostr(100*draw_E2/num_sim)+'%.'); summary.lines.add(''); summary.lines.add('ML vs Ref Data (dropout): ML SS = '+floattostr(sum_sqrs_ML_drop)); summary.lines.add('ML vs Ref Data (dropout): Ref SS = '+floattostr(sum_sqrs_ref_data_drop)); summary.lines.add('ML vs Ref Data (false alleles): ML SS = '+floattostr(sum_sqrs_ML_false)); summary.lines.add('ML vs Ref Data (false alleles): Ref SS = '+floattostr(sum_sqrs_ref_data_false)); summary.lines.add('ML was this much better on dropout: '+floattostr(sum_sqrs_ML_drop/sum_sqrs_ref_data_drop)); summary.lines.add('ML was this much better on false alleles: '+floattostr(sum_sqrs_ML_false/sum_sqrs_ref_data_false)); end; Series7.AddXY(mean_per_het_error1,mean_per_pcr_error2,'',clred); Series8.AddXY(mean_ref_error1,mean_ref_error2,'',clyellow); series17.addXY(mean_simple_error1,mean_simple_error2,'',clblack); series34.addXY(mean_sample_error1,mean_sample_error2,'',clblack); series37.addXY(mean_error1,mean_error2,'',clred); series42.addXY(mean_sample_error1_per_allele,mean_sample_error2_per_allele,'',clYellow); stdv_error1 := sqrt((num_sim*sum_sqrs_error1-sqr(sum_error1))/(num_sim*(num_sim-1))); stdv_error2 := sqrt((num_sim*sum_sqrs_error2-sqr(sum_error2))/(num_sim*(num_sim-1))); stdv_sample_error1_per_allele := sqrt((num_sim*sum_sqrs_sample_error1_per_allele-sqr(sum_sample_error1_per_allele))/(num_sim*(num_sim-1))); stdv_sample_error2_per_allele := sqrt((num_sim*sum_sqrs_sample_error2_per_allele-sqr(sum_sample_error2_per_allele))/(num_sim*(num_sim-1))); stdv_per_het_error1 := sqrt((num_sim*sum_sqrs_per_het_error1-sqr(sum_per_het_error1))/(num_sim*(num_sim-1))); stdv_per_pcr_error2 := sqrt((num_sim*sum_sqrs_per_pcr_error2-sqr(sum_per_pcr_error2))/(num_sim*(num_sim-1))); stdv_ref_error1 := sqrt((num_sim*sum_sqrs_ref_error1-sqr(sum_ref_error1))/(num_sim*(num_sim-1))); stdv_ref_error2 := sqrt((num_sim*sum_sqrs_ref_error2-sqr(sum_ref_error2))/(num_sim*(num_sim-1))); stdv_simple_error1 := sqrt((num_sim*sum_sqrs_simple_error1-sqr(sum_simple_error1))/(num_sim*(num_sim-1))); stdv_simple_error2 := sqrt((num_sim*sum_sqrs_simple_error2-sqr(sum_simple_error2))/(num_sim*(num_sim-1))); stdv_sample_error1 := sqrt((num_sim*sum_sqrs_sample_error1-sqr(sum_sample_error1))/(num_sim*(num_sim-1))); stdv_sample_error2 := sqrt((num_sim*sum_sqrs_sample_error2-sqr(sum_sample_error2))/(num_sim*(num_sim-1))); series38.addxy((mean_error1-stdv_error1), mean_error2,'',clred); series38.addxy((mean_error1+stdv_error1), mean_error2,'',clred); series39.addxy(mean_error1, (mean_error2-stdv_error2),'',clred); series39.addxy(mean_error1, (mean_error2+stdv_error2),'',clred); series40.addxy((mean_error1-stdv_error1), mean_error2,'',clred); series40.addxy((mean_error1+stdv_error1), mean_error2,'',clred); series41.addxy(mean_error1, (mean_error2-stdv_error2),'',clred); series41.addxy(mean_error1, (mean_error2+stdv_error2),'',clred); series44.addxy((mean_sample_error1_per_allele-stdv_sample_error1_per_allele), mean_sample_error2_per_allele,'',clblack); series44.addxy((mean_sample_error1_per_allele+stdv_sample_error1_per_allele), mean_sample_error2_per_allele,'',clblack); series45.addxy(mean_sample_error1_per_allele, (mean_sample_error2_per_allele-stdv_sample_error2_per_allele),'',clblack); series45.addxy(mean_sample_error1_per_allele, (mean_sample_error2_per_allele+stdv_sample_error2_per_allele),'',clblack); series46.addxy((mean_sample_error1_per_allele-stdv_sample_error1_per_allele), mean_sample_error2_per_allele,'',clblack); series46.addxy((mean_sample_error1_per_allele+stdv_sample_error1_per_allele), mean_sample_error2_per_allele,'',clblack); series47.addxy(mean_sample_error1_per_allele, (mean_sample_error2_per_allele-stdv_sample_error2_per_allele),'',clblack); series47.addxy(mean_sample_error1_per_allele, (mean_sample_error2_per_allele+stdv_sample_error2_per_allele),'',clblack); series9.addxy((mean_per_het_error1-stdv_per_het_error1), mean_per_pcr_error2,'',clred); series9.addxy((mean_per_het_error1+stdv_per_het_error1), mean_per_pcr_error2,'',clred); series10.addxy(mean_per_het_error1, (mean_per_pcr_error2-stdv_per_pcr_error2),'',clred); series10.addxy(mean_per_het_error1, (mean_per_pcr_error2+stdv_per_pcr_error2),'',clred); series21.addxy((mean_simple_error1-stdv_simple_error1), mean_simple_error2,'',clblack); series21.addxy((mean_simple_error1+stdv_simple_error1), mean_simple_error2,'',clblack); series22.addxy(mean_simple_error1, (mean_simple_error2-stdv_simple_error2),'',clblack); series22.addxy(mean_simple_error1, (mean_simple_error2+stdv_simple_error2),'',clblack); series32.addxy((mean_sample_error1-stdv_sample_error1), mean_sample_error2,'',clblack); series32.addxy((mean_sample_error1+stdv_sample_error1), mean_sample_error2,'',clblack); series33.addxy(mean_sample_error1, (mean_sample_error2-stdv_sample_error2),'',clblack); series33.addxy(mean_sample_error1, (mean_sample_error2+stdv_sample_error2),'',clblack); series11.addxy((mean_ref_error1-stdv_ref_error1), mean_ref_error2,'',clblue); series11.addxy((mean_ref_error1+stdv_ref_error1), mean_ref_error2,'',clblue); series12.addxy(mean_ref_error1, (mean_ref_error2-stdv_ref_error2),'',clblue); series12.addxy(mean_ref_error1, (mean_ref_error2+stdv_ref_error2),'',clblue); series14.addxy((mean_per_het_error1-stdv_per_het_error1), mean_per_pcr_error2,'',clred); series14.addxy((mean_per_het_error1+stdv_per_het_error1), mean_per_pcr_error2,'',clred); series13.addxy(mean_per_het_error1, (mean_per_pcr_error2-stdv_per_pcr_error2),'',clred); series13.addxy(mean_per_het_error1, (mean_per_pcr_error2+stdv_per_pcr_error2),'',clred); series20.addxy((mean_simple_error1-stdv_simple_error1), mean_simple_error2,'',clblack); series20.addxy((mean_simple_error1+stdv_simple_error1), mean_simple_error2,'',clblack); series19.addxy(mean_simple_error1, (mean_simple_error2-stdv_simple_error2),'',clblack); series19.addxy(mean_simple_error1, (mean_simple_error2+stdv_simple_error2),'',clblack); series16.addxy((mean_ref_error1-stdv_ref_error1), mean_ref_error2,'',clblue); series16.addxy((mean_ref_error1+stdv_ref_error1), mean_ref_error2,'',clblue); series15.addxy(mean_ref_error1, (mean_ref_error2-stdv_ref_error2),'',clblue); series15.addxy(mean_ref_error1, (mean_ref_error2+stdv_ref_error2),'',clblue); series31.addxy((mean_sample_error1-stdv_sample_error1), mean_sample_error2,'',clblack); series31.addxy((mean_sample_error1+stdv_sample_error1), mean_sample_error2,'',clblack); series30.addxy(mean_sample_error1, (mean_sample_error2-stdv_sample_error2),'',clblack); series30.addxy(mean_sample_error1, (mean_sample_error2+stdv_sample_error2),'',clblack); ref_data_num := ref_data_fraction * noof_samples; output1.Lines.Add(''); datetime1 := time - datetime0; output1.Lines.Add('It took ' + timetostr(datetime1) + ' hh:mm:ss to simulate and estimate errors from ' + inttostr(num_sim) + ' data sets.'); if ref_data_fraction < 1 then begin output1.Lines.Add('Click "Method comparison" tab to compare the ' + inttostr(num_sim) + ' ML error estimates with reference data counts from '+floattostr(ref_data_num)+' of the simulated data sets.'); end else begin output1.Lines.Add('Click "Method comparison" tab to compare the ' + inttostr(num_sim) + ' ML error estimates with reference data counts from the same simulated data sets.'); end; beep; summary.lines.add(''); summary.Lines.Add(inttostr(num_sim) + ' data sets generated using the following parameters:'); summary.lines.add(''); summary.lines.add('Expected heterozygosity (He) = ' + floattostr(He)); summary.lines.add('Allelic dropout error rate (E1) = ' + floattostrf(E1, ffnumber, 8,4)); summary.lines.add('False allele error rate (E2) = ' + floattostrf(E2, ffnumber, 8, 4)); summary.lines.add('Dominance of allelic dropout = ' + floattostr(drop_dom)); summary.Lines.add('Number of replicate genotypes = ' + inttostr(noof_samples)); summary.lines.add(''); summary.Lines.Add('Mean error estimates (standard deviation):'); summary.Lines.Add(''); summary.Lines.Add('Per-allele error rates:'); summary.Lines.Add('ML estimates: E1 = ' + floattostrf(mean_error1, ffnumber, 8,5) + ' (' + floattostrf(stdv_error1, ffnumber, 8, 5) + '), E2 = '+ floattostrf(mean_error2, ffnumber, 8, 5) + ' (' + floattostrf(stdv_error2, ffnumber, 8, 5) + ')'); summary.Lines.Add(''); summary.Lines.Add('Per-genotype error rates:'); summary.Lines.Add('Ref data estimates: ADO rate = ' + floattostrf(mean_ref_error1, ffnumber, 8,4) + ' (' + floattostrf(stdv_ref_error1, ffnumber, 8, 4) + '), FA rate = '+ floattostrf(mean_ref_error2, ffnumber, 8, 4) + ' (' + floattostrf(stdv_ref_error2, ffnumber, 8, 4)+') (based on ' + floattostr(ref_data_num) + ' reference samples)'); summary.Lines.Add('Simple method estimates: ADO rate = ' + floattostrf(mean_simple_error1, ffnumber, 8,4) + ' (' + floattostrf(stdv_simple_error1, ffnumber, 8, 4) + '), FA rate = '+ floattostrf(mean_simple_error2, ffnumber, 8, 4) + ' (' + floattostrf(stdv_simple_error2, ffnumber, 8, 4)+')'); summary.Lines.Add('ML estimates: ADO rate = ' + floattostrf(mean_per_het_error1, ffnumber, 8,4) + ' (' + floattostrf(stdv_per_het_error1, ffnumber, 8, 4) + '), FA rate = '+ floattostrf(mean_per_pcr_error2, ffnumber, 8, 4) + ' (' + floattostrf(stdv_per_pcr_error2, ffnumber, 8, 4)+')'); fast_settings := 0; end; procedure TtrialForm1.simdata; var pcr1_sum, pcr2_sum, true_genotype_sum, no_error_hom, no_error_het, doubledrop, doubledrop_het, E2_hom, E2E2_hom, E1E2_hom, E1_het, E1E2_het, E2_het, E2E2_het, obs_het, obs_hom, obs_het2, obs_hom2, allele1, allele2, allele3, allele4, trueallele1, trueallele2, falseallele1, falseallele2, falseallele3, falseallele4, sample_drop_het, sample_doubledrop_het, sample_doubledrop_hom, sample_false_visible, sample_obs_het, sample_obs_hom, n, h: Integer; falserand, droprand1, droprand2, noof_refsamples: real; true_E1, true_E2: real; begin sim_or_real := 0; n := 1; sim_cat[1]:=0; sim_cat[2]:=0; sim_cat[3]:=0; sim_cat[4]:=0; sim_cat[5]:=0; sim_cat[6]:=0; sim_cat[7]:=0; sim_cat[8]:=0; sample_error[1] := 0; sample_error[2] := 0; sample_drop_het := 0; obs_het := 0; obs_hom := 0; obs_het2 := 0; obs_hom2 := 0; no_error_het := 0; no_error_hom := 0; doubledrop := 0; doubledrop_het := 0; sample_doubledrop_het := 0; sample_doubledrop_hom := 0; sample_obs_het := 0; sample_obs_hom := 0; sample_false_visible := 0; E2_hom := 0; E2E2_hom := 0; E1E2_hom := 0; E1_het := 0; E1E2_het := 0; E2_het := 0; E2E2_het := 0; noof_refsamples := noof_samples * ref_data_fraction; drop_per_het := 0; false_per_pcr := 0; randomize; output1.Lines.Add(''); output1.Lines.Add('Simulating the repeat-genotyping of '+inttostr(noof_samples)+' DNA samples.'); output1.Lines.Add('Parameters:'); output1.Lines.Add('E1 = '+floattostr(E1)); output1.Lines.Add('E2 = '+floattostr(E2)); output1.Lines.Add('He = '+floattostr(He)); repeat if variable_sample_quality = 1 then begin {remember true values of E1 and E2, to be reassigned at end of repeat-until loop} true_E1:=E1; true_E2:=E2; {simulate variable sample quality by multiplying E1 and E2 by a factor that is an S-shaped function of n. It averages about 0.1 over the 1st 25% of samples to about 2.5 over the last 10%} E1:=E1*((2/3)*sqr(1-COS(PI*(n-1)/(noof_samples-1)))+(noof_samples/2-n)/(noof_samples*5)); E2:=E2*((2/3)*sqr(1-COS(PI*(n-1)/(noof_samples-1)))+(noof_samples/2-n)/(noof_samples*5)); end; {simulates the first PCR} allele1 := 1; droprand1 := random; droprand2 := random; if random < (He*(1-Fis)) then allele2 := 2 {the true genotype will be heterozygous at probability = He} else allele2 :=1; {otherwise it will be homozygous} trueallele1 := allele1; trueallele2 := allele2; falserand := random; if falserand < E2 then allele1 := 4; if falserand < E2 then sample_error[2] := sample_error[2] + 1; if (falserand < E2) and (droprand1 > sqr(E1)) and (droprand2 > sqr(E1)) and (droprand1 > sqr(E1)+E1*(1-E1)) then sample_false_visible := sample_false_visible + 1; falserand := random; if falserand < E2 then allele2 := 8; if falserand < E2 then sample_error[2] := sample_error[2] + 1; if (falserand < E2) and (droprand1 > sqr(E1)) and (droprand2 > sqr(E1)) and ((droprand1 < sqr(E1)+E1*(1-E1)) or (droprand1 > sqr(E1)+2*E1*(1-E1))) then sample_false_visible := sample_false_visible + 1; falseallele1 := allele1; {falsealleleX is the state of alleleX following false allele arror but before dropout. It isn't necesssarily false.} falseallele2 := allele2; if ((allele1 >= 4) and (random > drop_dom)) then {then allele1 stays the same} else if droprand1 < sqr(E1) then allele1 := 0; {the value 0 represents a double dropout and allows me to eliminate these later by checking if pcr_sum = 0} if ((allele2 >= 4) and (random > drop_dom)) then {then allele2 stays the same} else if droprand1 < sqr(E1) then allele2 := 0; if ((allele1 = 0) or (allele2 = 0) and (allele1 + allele2 > 0)) then begin allele1 := max(allele1,allele2); allele2 := allele1; end; if droprand1 < sqr(E1) then sample_error[1] := sample_error[1] + 2; if ((allele1 >= 4) and (random > drop_dom)) then {then allele1 stays the same} else if (droprand1 > sqr(E1)) and (droprand1 < sqr(E1)+E1*(1-E1)) then allele1 := allele2; if (droprand1 > sqr(E1)) and (droprand1 < sqr(E1)+E1*(1-E1)) then sample_error[1] := sample_error[1] + 1; if ((allele2 >= 4) and (random > drop_dom)) then {then allele2 stays the same} else if (droprand1 > sqr(E1)+E1*(1-E1)) and (droprand1 < sqr(E1)+2*E1*(1-E1)) then allele2 := allele1; if (droprand1 > sqr(E1)+E1*(1-E1)) and (droprand1 < sqr(E1)+2*E1*(1-E1)) then sample_error[1] := sample_error[1] + 1; if (droprand2 > sqr(E1)) and (droprand1 > sqr(E1)) and (droprand1 < sqr(E1)+2*E1*(1-E1)) and (trueallele1 <> trueallele2) then sample_drop_het := sample_drop_het + 1; {simulates the second PCR} allele3 := 1; allele4 := trueallele2; falserand := random; if falserand < E2 then allele3 := 16; if falserand < E2 then sample_error[2] := sample_error[2] + 1; if (falserand < E2) and (droprand1 > sqr(E1)) and (droprand2 > sqr(E1)) and (droprand2 > sqr(E1)+E1*(1-E1)) then sample_false_visible := sample_false_visible + 1; falserand := random; if falserand < E2 then allele4 := 32; if falserand < E2 then sample_error[2] := sample_error[2] + 1; if (falserand < E2) and (droprand1 > sqr(E1)) and (droprand2 > sqr(E1)) and ((droprand2 < sqr(E1)+E1*(1-E1)) or (droprand2 > sqr(E1)+2*E1*(1-E1))) then sample_false_visible := sample_false_visible + 1; falseallele3 := allele3; falseallele4 := allele4; if ((allele3 >= 4) and (random > drop_dom)) then {then allele3 stays the same} else if droprand2 < sqr(E1) then allele3 := 0; if ((allele4 >= 4) and (random > drop_dom)) then {then allele4 stays the same} else if droprand2 < sqr(E1) then allele4 := 0; if ((allele3 = 0) or (allele4 = 0) and (allele3 + allele4 > 0)) then begin allele3 := max(allele3,allele4); allele4 := allele3; end; if droprand2 < sqr(E1) then sample_error[1] := sample_error[1] + 2; if ((allele3 >= 4) and (random > drop_dom)) then {then allele3 stays the same} else if (droprand2 > sqr(E1)) and (droprand2 < sqr(E1)+E1*(1-E1)) then allele3 := allele4; if (droprand2 > sqr(E1)) and (droprand2 < sqr(E1)+E1*(1-E1)) then sample_error[1] := sample_error[1] + 1; if ((allele4 >= 4) and (random > drop_dom)) then {then allele4 stays the same} else if (droprand2 > sqr(E1)+E1*(1-E1)) and (droprand2 < sqr(E1)+2*E1*(1-E1)) then allele4 := allele3; if (droprand2 > sqr(E1)+E1*(1-E1)) and (droprand2 < sqr(E1)+2*E1*(1-E1)) then sample_error[1] := sample_error[1] + 1; if (droprand1 > sqr(E1)) and (droprand2 > sqr(E1)) and (droprand2 < sqr(E1)+2*E1*(1-E1)) and (trueallele1 <> trueallele2) then sample_drop_het := sample_drop_het + 1; if ((droprand1 < sqr(E1)) or (droprand2 < sqr(E1))) and (trueallele1 <> trueallele2) then sample_doubledrop_het := sample_doubledrop_het + 1; if ((droprand1 < sqr(E1)) or (droprand2 < sqr(E1))) and (trueallele1 = trueallele2) then sample_doubledrop_hom := sample_doubledrop_hom + 1; if (trueallele1 <> trueallele2) then sample_obs_het := sample_obs_het + 1; if (trueallele1 = trueallele2) then sample_obs_hom := sample_obs_hom + 1; {The alleles are so named that every genotype from a single pcr sums to a different value, pcr1_sum and pcr2_sum. This simplifies some of the categorisation statements} pcr1_sum := allele1 + allele2; pcr2_sum := allele3 + allele4; {decision tree to categorise the double pcr reads into the 8 categories} if ((pcr1_sum = 0) or (pcr2_sum = 0)) {counts any double dropout (DD), eg 0012 into category 8} then sim_cat[8] := sim_cat[8] + 1 else if (pcr1_sum = pcr2_sum) and (allele1 = allele2) {counts apparent double homozygotes, eg 1111, into category 1} then sim_cat[1] := sim_cat[1] + 1 else if (pcr1_sum = pcr2_sum) and (allele1 <> allele2) {counts true heterozygotes, eg 1212, into category 2} then sim_cat[2] := sim_cat[2] + 1 else if (pcr1_sum <> pcr2_sum) {counts 1222-type errors into category 3} and ((((allele1 = allele2) and (allele3 <> allele4)) and ((allele1 = allele3) or (allele1 = allele4))) or (((allele1 <> allele2) and (allele3 = allele4)) and ((allele3 = allele1) or (allele3 = allele2)))) then sim_cat[3] := sim_cat[3] + 1 else if (pcr1_sum <> pcr2_sum) and (allele1 = allele2) and (allele3=allele4) {counts 1122-type errors into category 4} then sim_cat[4] := sim_cat[4] + 1 else if (pcr1_sum <> pcr2_sum) and (allele1 <> allele2) and (allele3 <> allele4) {counts 1814-type errors into category 5} and ((allele1 = allele3) or (allele1 = allele4) or (allele2 = allele3) or (allele2 = allele4)) then sim_cat[5] := sim_cat[5] + 1 else if (pcr1_sum <> pcr2_sum) {counts 1124-type errors into category 6} and (((allele1 <> allele2) and (allele3 = allele4)) or ((allele1 = allele2) and (allele3 <> allele4))) and ((allele1 <> allele3) and (allele1 <> allele4) and (allele2 <> allele3) and (allele2 <> allele4)) then sim_cat[6] := sim_cat[6] + 1 else if (allele1 <> allele2) and (allele2 <> allele3) and (allele3 <> allele4) and (allele2 <> allele3) and (allele2 <> allele4) and (allele3 <> allele4) then sim_cat[7] := sim_cat[7] + 1; {decision tree to estimate the error rates with reference data based on PCR1} if (n <= noof_refsamples) then begin if (trueallele1 <> trueallele2) then obs_het := obs_het + 1; if (trueallele1 = trueallele2) then obs_hom := obs_hom + 1; if (trueallele1 <> trueallele2) and (pcr1_sum = 0) then doubledrop_het := doubledrop_het + 1; if (trueallele1 <> trueallele2) and (trueallele1 = allele1) and (trueallele2 = allele2) then no_error_het := no_error_het + 1 else if (pcr1_sum = 0) then doubledrop := doubledrop + 1 else if (trueallele1 <> trueallele2) and (allele1 = allele2) and ((trueallele1 = allele1) or (trueallele1 = allele2) or (trueallele2 = allele1) or (trueallele2 = allele2)) then E1_het := E1_het + 1 else if (trueallele1 <> trueallele2) and (allele1 = allele2) and (trueallele1 <> allele1) and (trueallele1 <> allele2) and (trueallele2 <> allele1) and (trueallele2 <> allele2) then E1E2_het := E1E2_het + 1 else if (trueallele1 <> trueallele2) and (allele1 <> allele2) and (trueallele1 <> allele1) and (trueallele1 <> allele2) and (trueallele2 <> allele1) and (trueallele2 <> allele2) then E2E2_het := E2E2_het + 1 else if (trueallele1 <> trueallele2) and (pcr1_sum <> 3) and ((trueallele1 = allele1) or (trueallele1 = allele2) or (trueallele2 = allele1) or (trueallele2 = allele2)) then E2_het := E2_het + 1 else if (trueallele1 = trueallele2) and (trueallele1 = allele1) and (trueallele2 = allele2) then no_error_hom := no_error_hom + 1 else if (trueallele1 = trueallele2) and (pcr1_sum <> 0) and ((trueallele1 = allele1) or (trueallele1 = allele2)) then E2_hom := E2_hom + 1 else if (trueallele1 = trueallele2) and (pcr1_sum <> 0) and (allele1 <> allele2) then E2E2_hom := E2E2_hom + 1 else E1E2_hom := E1E2_hom + 1; end; if see_genotypes = 1 then output1.Lines.Add(inttostr(n)+': '+inttostr(trueallele1)+' '+inttostr(trueallele2)+' ' +inttostr(falseallele1)+' '+inttostr(falseallele2)+' '+inttostr(falseallele3)+' '+inttostr(falseallele4) +' '+inttostr(allele1)+' '+inttostr(allele2)+' '+inttostr(allele3)+' '+inttostr(allele4)); if see_categories = 1 then output1.Lines.Add(' C1:'+inttostr(sim_cat[1])+' C2:'+inttostr(sim_cat[2])+' C3:'+ inttostr(sim_cat[3])+' C4:'+inttostr(sim_cat[4])+' C5:'+inttostr(sim_cat[5])+' C6:'+ inttostr(sim_cat[6])+' C7:'+inttostr(sim_cat[7])+' DD:'+inttostr(sim_cat[8])+' Sum:'+ inttostr(sim_cat[1]+sim_cat[2]+sim_cat[3]+sim_cat[4]+sim_cat[5]+sim_cat[6]+sim_cat[7]+sim_cat[8])); n := n + 1; if variable_sample_quality = 1 then begin E1:=true_E1; {reassign population parameter values to E1 and E2} E2:=true_E2; end; until n = NOOF_SAMPLES + 1; if ((sim_cat[1]+sim_cat[2]+sim_cat[3]+sim_cat[4]+sim_cat[5]+sim_cat[6]+sim_cat[7]+sim_cat[8]) <> NOOF_SAMPLES) then output1.Lines.Add('Error: over- or under-counting data. Please check input file.'); tot_sim_obs := sim_cat[1]+sim_cat[2]+sim_cat[3]+sim_cat[4]+sim_cat[5]+sim_cat[6]+sim_cat[7]; if see_sim_cat = 1 then begin for h := 1 to 8 do output1.Lines.Add('Simulated number in category ' + inttostr(h) + ': '+inttostr(sim_cat[h])); output1.Lines.Add(' ----------------------'); output1.Lines.Add('Sum of observable categories 1-7: '+inttostr(tot_sim_obs)); output1.Lines.Add(''); end; drop_per_het := (E1_het + E1E2_het) / (obs_het - doubledrop_het); false_per_pcr := (obs_het + obs_hom - doubledrop - E1_het - no_error_hom - no_error_het) / (obs_het + obs_hom - doubledrop) ; simple_error2_per_pcr := (1/(2*noof_samples*He))*(2*He*noof_samples-2*sqrt(sqr(noof_samples*He)-He*noof_samples*(sim_cat[5]+sim_cat[6]+sim_cat[7]))); if sqr(noof_samples*He)-2*He*noof_samples*sim_cat[3]+4*He*sqr(noof_samples)*simple_error2_per_pcr-4*He*sqr(noof_samples)*sqr(simple_error2_per_pcr)-4*sqr(He)*sqr(noof_samples)*simple_error2_per_pcr+4*sqr(He)*sqr(noof_samples)*sqr(simple_error2_per_pcr) < 0 then simple_error1_per_pcr := (1/(4*noof_samples*He))*(2*noof_samples*He) else simple_error1_per_pcr := (1/(4*noof_samples*He)) *(2*noof_samples*He-2*sqrt(sqr(noof_samples*He)-2*He*noof_samples*sim_cat[3]+4*He*sqr(noof_samples)*simple_error2_per_pcr-4*He*sqr(noof_samples)*sqr(simple_error2_per_pcr)-4*sqr(He)*sqr(noof_samples)*simple_error2_per_pcr+4*sqr(He)*sqr(noof_samples)*sqr(simple_error2_per_pcr))); {(E2_het + E2E2_het + E1E2_het + E2_hom + E2E2_hom + E1E2_hom) / (obs_het + obs_hom - doubledrop);} simple_error1_per_het := simple_error1_per_pcr{/He}; sample_drop_per_het := sample_drop_het/(2*(sample_obs_het-sample_doubledrop_het)); sample_false_per_pcr := sample_false_visible/(2*noof_samples-sample_doubledrop_het-sample_doubledrop_hom); sample_error1_per_allele := sample_error[1]/(4*noof_samples); sample_error2_per_allele := sample_error[2]/(4*noof_samples); end; procedure TtrialForm1.getlikelihood; begin if (He <= 0) or (He >= 1) then showmessage('Input He = '+floattostr(He)+'. Values of 1 or 0 are not allowed. Please check input file (if you are anaysing real data) or enter a valid He (if you are doing a simulation).'); if sim_or_real = 0 then output1.Lines.Add('Log likelihood of simulated data = '+floattostr(log_likelihood(E1,E2))) else output1.Lines.Add('Log likelihood of real data = '+floattostr(log_likelihood(E1,E2))); output1.Lines.Add(''); exp_cat[1] := exp_freq[1] * (NOOF_SAMPLES - sim_cat[8]); exp_cat[2] := exp_freq[2] * (NOOF_SAMPLES - sim_cat[8]); exp_cat[3] := exp_freq[3] * (NOOF_SAMPLES - sim_cat[8]); exp_cat[4] := exp_freq[4] * (NOOF_SAMPLES - sim_cat[8]); exp_cat[5] := exp_freq[5] * (NOOF_SAMPLES - sim_cat[8]); exp_cat[6] := exp_freq[6] * (NOOF_SAMPLES - sim_cat[8]); exp_cat[7] := exp_freq[7] * (NOOF_SAMPLES - sim_cat[8]); {exp_freq_obs := exp_cat[1]+exp_cat[2]+exp_cat[3]+exp_cat[4]+exp_cat[5]+exp_cat[6]+exp_cat[7]; } if see_exp_freq = 1 then output1.Lines.Add('Expected frequency of categories: 1: '+floattostr(exp_freq[1])+ ' 2: '+floattostr(exp_freq[2])+ ' 3: '+floattostr(exp_freq[3])+ ' 4: '+floattostr(exp_freq[4])+ ' 5: '+floattostr(exp_freq[5])+ ' 6: '+floattostr(exp_freq[6])+ ' 7: '+floattostr(exp_freq[7])); output1.Lines.Add(''); if see_exp_num = 1 then output1.Lines.Add('Expected number in categories: 1: '+floattostr(exp_cat[1])+ ' 2: '+floattostr(exp_cat[2])+ ' 3: '+floattostr(exp_cat[3])+ ' 4: '+floattostr(exp_cat[4])+ ' 5: '+floattostr(exp_cat[5])+ ' 6: '+floattostr(exp_cat[6])+ ' 7: '+floattostr(exp_cat[7])); output1.Lines.Add(''); end; procedure TtrialForm1.maxlikelihood; {Searches likelihood surface for highest value.} var step, init_step, min_step, newerror1, newerror2, best_yet_error1, best_yet_error2, walkE1, walkE2, ref_error1, ref_error2: real; old_likelihood, new_likelihood, max_yet_log_likelihood: extended; temp, accept: real; noof_extra_steps, i: integer; res, remainder: word; tau, likelihood_ratio, max_likelihood_ratio: extended; not_much_better: integer; begin if (He <= 0) or (He >= 1) then showmessage('Input He = '+floattostr(He)+'. Values of 1 or 0 are not allowed. Please check input file (if you are anaysing real data) or enter a valid He (if you are doing a simulation).'); randomize; i := 0; init_step := 0.1; min_step := 0.000001; error1 := random*0.7+0.01; error2 := random*0.7+0.01; max_likelihood_ratio := exp(1000); temp := 1000; not_much_better := 0; true_He := He; if (sim_or_real = 0) and (He_bias = 1) then begin He := RandG(He,sim_He_stddev); {simulate sampling error in He} {He := 1-sqr(1-E2)*(1-He); } {simulate upward He bias caused by E2 errors} end; if He < 0.00001 then He := 0.00001; if He > 0.99999 then He := 0.99999; sim_He := He; Series3.clear; series24.clear; series25.clear; series26.clear; series27.clear; if noof_steps < 1000 then noof_steps := 1000; max_yet_log_likelihood := (log_likelihood(error1,error2)); best_yet_error1 := error1; best_yet_error2 := error2; repeat inc(i); temp := temp*power(1E-11,1/noof_steps); step := sqrt(random) * (power((init_step), (noof_steps/(1+noof_steps-i))) + min_step); walkE1 := random; walkE2 := random; if (walkE1 < 1/3) then newerror1 := error1 + step else if (walkE1 > 2/3) then newerror1 := error1 -step else newerror1 := error1; if (walkE2 < 1/3) then newerror2 := error2 + step else if (walkE2 > 2/3) then newerror2 := error2 -step else newerror2 := error2; if newerror1 >= 0.7 then newerror1 := 0.7; if newerror1 <= 0 then newerror1 := 0 + step; if newerror2 >= 0.7 then newerror2 := 0.7; if newerror2 <= 0 then newerror2 := 0 + step; old_likelihood := exp(log_likelihood(error1,error2)); new_likelihood := exp(log_likelihood(newerror1,newerror2)); likelihood_ratio := new_likelihood/old_likelihood; {if likelihood_ratio < 1/max_likelihood_ratio then likelihood_ratio := 1/max_likelihood_ratio; } if likelihood_ratio > max_likelihood_ratio then tau := 1 else if likelihood_ratio < 1/max_likelihood_ratio then tau := 0 else {if (likelihood_ratio < max_likelihood_ratio) and (likelihood_ratio > 1/max_likelihood_ratio) then } begin tau := min(power(likelihood_ratio, 1/temp), 1); {simulated annealing algorithm} if i > noof_steps*9/10 then begin if tau < 1 then tau := 0 else tau:=1; end; end; accept := random; if accept < tau then begin error1 := newerror1; error2 := newerror2; end; {this loop cuts to the end of the search if improvements in likelihod are < 1% over all of 10000 consecutive iterations} if exp(log_likelihood(error1,error2))/old_likelihood < 1.01 then inc(not_much_better) else not_much_better := 0; {if not_much_better > 499 then output1.Lines.Add(inttostr(not_much_better));} if (not_much_better > 1000) and (i < noof_steps - 1000) then i := noof_steps - 1000; if noof_steps>=10000 then divmod(i,500,res,remainder) else divmod(i,50,res,remainder); if fast_settings = 0 then begin if (remainder = 0) or (i <= 50) or (i > noof_steps - 50) then output1.Lines.Add(inttostr(i) + '/' + inttostr(noof_steps) + ': E1=' + floattostrf(error1, ffnumber, 8, 6) + ' E2=' + floattostrf(error2, ffnumber, 8, 6) + ' ln(lhood)=' + floattostrf(log_likelihood(error1, error2), ffnumber, 11, 8) {+ ' ln(lhood) ratio=' + floattostr(likelihood_ratio) + ' temp=' + floattostr(temp)} + ' tau=' + floattostr(tau) + ' Step=' + floattostr(step)); end; if (see_walk_graph = 1) and ((single_locus = 1) or (sim_or_real = 0)) then begin Series3.AddXY(error1,error2,'',clblack); Series24.AddXY(i,log_likelihood(error1, error2),'',clblack); Series25.AddXY(i,error1,'',clblack); Series26.AddXY(i,error2,'',clred); {divmod(i,50,res,remainder); if remainder = 0 then Series27.AddXY(i,tau,'',clblue);} end; {this IF THEN test remembers the most likely errors 1&2 visited throughout the search so that if the search happens to end on a less likely peak, eg a local optimum, the search will hopefully have previously visited and remembered the global optimum} if log_likelihood(error1,error2) > max_yet_log_likelihood then begin max_yet_log_likelihood := log_likelihood(error1,error2); best_yet_error1 := error1; best_yet_error2 := error2; end; until (i = noof_steps); {this IF THEN recalls the best estimates found throughout the search} {but now removed so that it always uses this refining step} { if log_likelihood(error1,error2) < max_yet_log_likelihood then begin } error1 := best_yet_error1; error2 := best_yet_error2; noof_extra_steps := 1000; i := 0; repeat inc(i); step := 0.005*EXP(0.005*-i)-3.36897E-05; walkE1 := random; walkE2 := random; if (walkE1 < 1/3) then newerror1 := error1 + step else if (walkE1 > 2/3) then newerror1 := error1 -step else newerror1 := error1; if (walkE2 < 1/3) then newerror2 := error2 + step else if (walkE2 > 2/3) then newerror2 := error2 -step else newerror2 := error2; if newerror1 >= 0.7 then newerror1 := 0.7; if newerror1 <= 0 then newerror1 := 0 + step; if newerror2 >= 0.7 then newerror2 := 0.7; if newerror2 <= 0 then newerror2 := 0 + step; if log_likelihood(error1,error2) < log_likelihood(newerror1,newerror2) then begin error1 := newerror1; error2 := newerror2; end; {output1.Lines.Add(inttostr(i) + '/' + inttostr(noof_extra_steps) + ': E1=' + floattostrf(error1, ffnumber, 8, 6) + ' E2=' + floattostrf(error2, ffnumber, 8, 6) + ' ln(lhood)=' + floattostrf(log_likelihood(error1, error2), ffnumber, 11, 8) + ' Step=' + floattostr(step));} until (i = noof_extra_steps); { end; } if sim_or_real = 0 then begin output1.Lines.Add(''); output1.Lines.Add('Simulated He = '+floattostrf(sim_He, ffnumber, 8, 4)+' (SE = '+floattostrf(sim_He_stddev, ffnumber, 8, 6)+')'); output1.Lines.Add('Population errors: E1=' + floattostrf(E1, ffnumber, 8, 4) + ', E2=' + floattostrf(E2, ffnumber, 8, 4)); output1.Lines.Add('Sample errors: E1=' + floattostrf(sample_error[1]/(4*NOOF_SAMPLES), ffnumber, 8, 4) + ', E2=' + floattostrf(sample_error[2]/(4*NOOF_SAMPLES), ffnumber, 8, 4)); output1.Lines.Add('ML estimate errors: E1=' + floattostrf(error1, ffnumber, 8, 4) + ', E2=' + floattostrf(error2, ffnumber, 8, 4) + ' Max log likelihood = ' + floattostrf(log_likelihood(error1,error2), ffnumber, 8, 4)) end else begin output1.Lines.Add('Estimated E1 = ' + floattostrf(error1, ffnumber, 8, 4) + ' Estimated E2 = ' + floattostrf(error2, ffnumber, 8, 4) + ' Max likelihood = ' + floattostrf(log_likelihood(error1,error2), ffnumber, 8, 4)); end; output1.Lines.Add(''); est_drop_per_het := (error1 * 2) / (error1 + 1); est_false_per_pcr := error2 * (error1 * error2 - error2 + 2) / (error1 + 1); {If you ignore the effect of dropout on E2 then the above equation is "est_false_per_pcr := (1 - sqr(1 - error2));" but this will over-estimate E2 at high dropout rates} true_drop_per_het := (E1 * 2) / (E1 + 1); true_false_per_pcr := E2 * (E1 * E2 - E2 + 2) / (E1 + 1); {same applies to this eqn (1 - sqr(1 - E2)); } if see_ml_vs_ref = 1 then begin output1.Lines.Add('Ref ADO/het: ' + floattostrf(drop_per_het, ffnumber, 8, 4) + ' Ref FA/PCR: ' + floattostrf(false_per_pcr, ffnumber, 8, 4) + ' ML ADO/het: ' + floattostrf(est_drop_per_het, ffnumber, 8, 4) + ' ML FA/PCR: ' + floattostrf(est_false_per_pcr, ffnumber, 8, 4) + ' True ADO/het: ' + floattostrf(true_drop_per_het, ffnumber, 8, 4) + {remember this is only true for drop_dom = 1} ' True FA/PCR: ' + floattostrf(true_false_per_pcr, ffnumber, 8, 4)); {remember this is only true for drop_dom = 1} if (abs(drop_per_het-est_drop_per_het)< (0.05*((drop_per_het+est_drop_per_het)/2))) or (abs(drop_per_het-est_drop_per_het)<0.0001) then draw_E1:=draw_E1+1 else if sqr(drop_per_het-true_drop_per_het) > sqr(est_drop_per_het-true_drop_per_het) then ml_wins_E1:=ml_wins_E1+1 else ref_wins_E1:=ref_wins_E1+1; if abs(false_per_pcr-est_false_per_pcr)< (0.05*((false_per_pcr+est_false_per_pcr)/2)) then draw_E2:=draw_E2+1 else if sqr(false_per_pcr-true_false_per_pcr) > sqr(est_false_per_pcr-true_false_per_pcr) then ml_wins_E2:=ml_wins_E2+1 else ref_wins_E2:=ref_wins_E2+1; sum_sqrs_ref_data_drop := sum_sqrs_ref_data_drop + sqr(drop_per_het-true_drop_per_het); sum_sqrs_ML_drop := sum_sqrs_ML_drop + sqr(est_drop_per_het-true_drop_per_het); sum_sqrs_ref_data_false := sum_sqrs_ref_data_false + sqr(false_per_pcr-true_false_per_pcr); sum_sqrs_ML_false := sum_sqrs_ML_false + sqr(est_false_per_pcr-true_false_per_pcr); end; if (see_ml_vs_ref_graph = 1) then begin ref_error1 := drop_per_het; ref_error2 := false_per_pcr; simple_error1 := simple_error1_per_het; simple_error2 := simple_error2_per_pcr; sum_error1 := sum_error1 + error1; sum_error2 := sum_error2 + error2; sum_ref_error1 := sum_ref_error1 + ref_error1; sum_ref_error2 := sum_ref_error2 + ref_error2; sum_per_het_error1 := sum_per_het_error1 + est_drop_per_het; sum_per_pcr_error2 := sum_per_pcr_error2 + est_false_per_pcr; sum_simple_error1 := sum_simple_error1 + simple_error1; sum_simple_error2 := sum_simple_error2 + simple_error2; sum_sample_error1 := sum_sample_error1 + sample_drop_per_het; sum_sample_error2 := sum_sample_error2 + sample_false_per_pcr; sum_sample_error1_per_allele := sum_sample_error1_per_allele + sample_error1_per_allele; sum_sample_error2_per_allele := sum_sample_error2_per_allele + sample_error2_per_allele ; sum_pop_dev_error1 := sum_pop_dev_error1 + (error1 - E1); sum_pop_dev_error2 := sum_pop_dev_error2 + (error2 - E2); sum_sample_dev_error1 := sum_sample_dev_error1 + (error1 - sample_error1_per_allele); sum_sample_dev_error2 := sum_sample_dev_error2 + (error2 - sample_error2_per_allele); sum_sqrs_pop_dev_error1 := sum_sqrs_pop_dev_error1 + sqr(error1 - E1); sum_sqrs_pop_dev_error2 := sum_sqrs_pop_dev_error2 + sqr(error2 - E2); sum_sqrs_sample_dev_error1 := sum_sqrs_sample_dev_error1 + sqr(error1 - sample_error1_per_allele); sum_sqrs_sample_dev_error2 := sum_sqrs_sample_dev_error2 + sqr(error2 - sample_error2_per_allele); sum_sqrs_error1 := sum_sqrs_error1 + sqr(error1); sum_sqrs_error2 := sum_sqrs_error2 + sqr(error2); sum_sqrs_ref_error1 := sum_sqrs_ref_error1 + sqr(ref_error1); sum_sqrs_ref_error2 := sum_sqrs_ref_error2 + sqr(ref_error2); sum_sqrs_per_het_error1 := sum_sqrs_per_het_error1 + sqr(est_drop_per_het); sum_sqrs_per_pcr_error2 := sum_sqrs_per_pcr_error2 + sqr(est_false_per_pcr); sum_sqrs_simple_error1 := sum_sqrs_simple_error1 + sqr(simple_error1); sum_sqrs_simple_error2 := sum_sqrs_simple_error2 + sqr(simple_error2); sum_sqrs_sample_error1 := sum_sqrs_sample_error1 + sqr(sample_drop_per_het); sum_sqrs_sample_error2 := sum_sqrs_sample_error2 + sqr(sample_false_per_pcr); sum_sqrs_sample_error1_per_allele := sum_sqrs_sample_error1_per_allele + sqr(sample_error1_per_allele); sum_sqrs_sample_error2_per_allele := sum_sqrs_sample_error2_per_allele + sqr(sample_error2_per_allele); if see_points=1 then begin Series4.AddXY(est_drop_per_het, est_false_per_pcr,'',clRed); Series5.AddXY(ref_error1,ref_error2,'',clyellow); series18.addXY(simple_error1,simple_error2,'',clblack); Series23.AddXY(sample_drop_per_het, sample_false_per_pcr,'',clyellow); series35.addXY(error1,error2,'',clRed); series43.addXY(sample_error1_per_allele,sample_error2_per_allele,'',clYellow); end; if (log_likelihood(error1,error2)-log_likelihood(sample_error1_per_allele,sample_error2_per_allele)<2.3026) then sample_inside_90CI := sample_inside_90CI+1; if (log_likelihood(error1,error2)-log_likelihood(E1,E2)<2.3026) then pop_inside_90CI := pop_inside_90CI+1; if (log_likelihood(error1,error2)-log_likelihood(sample_error1_per_allele,sample_error2_per_allele)<2.9957) then sample_inside_95CI := sample_inside_95CI+1; if (log_likelihood(error1,error2)-log_likelihood(E1,E2)<2.9957) then pop_inside_95CI := pop_inside_95CI+1; end; He := true_He; end; procedure TtrialForm1.read_data; var i, j, h:integer; begin for i:=1 to 2000 do begin for j:=1 to 100 do begin testarray[i,j]:=0; end; end; using_He_limits:=0; single_locus := 0; which_locus := 1; readln(infile1); {lets the input file have a first line of any text} readln(infile1,noof_loci); h := 0; repeat inc(h); read(infile1, data_He[h]); until eoln(infile1); readln(infile1); if data_He[2]=0 then single_locus := 1; h := 0; repeat inc(h); read(infile1, data_He_stddev[h]); until eoln(infile1); readln(infile1); i:=0; while not eof(infile1) do begin inc(i); j:=0; while not eoln(infile1) do begin inc(j); read(infile1,testarray[i,j]); end; readln(infile1); end; end; procedure TtrialForm1.get_data; {extract individual locus data from testarray and categorise} var i, j, k, h :integer; pcr1_sum, pcr2_sum, allele1, allele2, allele3, allele4, tot_data_obs: integer; begin see_genotypes := 1; see_categories := 0; see_cat_summary := 1; see_exp_freq := 0; see_exp_num := 0; see_walk := 1; see_walk_graph := 1; see_sim_cat := 0; see_data_cat := 1; see_ml_vs_ref := 0; see_ml_vs_ref_graph := 0; if (see_walk_graph = 1) and (using_He_limits = 0) then noof_steps := strtoint(inputbox('Enter number of search steps', 'At least 1000', inttostr(noof_steps))); if (single_locus = 1) and (which_locus > 0) then j := (which_locus-1)*4 else j := 0; h := 0; repeat if single_locus = 0 then inc(h) else h := which_locus; data_cat[1]:=0; data_cat[2]:=0; data_cat[3]:=0; data_cat[4]:=0; data_cat[5]:=0; data_cat[6]:=0; data_cat[7]:=0; data_cat[8]:=0; i := 0; He := data_He[h]; output1.Lines.Add('Data from locus '+inttostr(h)+' loaded. He = ' + floattostr(He)+'.'); repeat inc(i); allele1 := testarray[i,j+1]; allele2 := testarray[i,j+2]; allele3 := testarray[i,j+3]; allele4 := testarray[i,j+4]; {The alleles are so named that every genotype from a single pcr sums to a different value, pcr1_sum and pcr2_sum. This simplifies some of the categorisation rules} if allele1 = 0 then allele1 := allele2; if allele2 = 0 then allele2 := allele1; if allele3 = 0 then allele3 := allele4; if allele4 = 0 then allele4 := allele3; pcr1_sum := allele1 + allele2; pcr2_sum := allele3 + allele4; if (allele1+allele2+allele3+allele4 >= 0) and (using_He_limits = 0) then output1.Lines.Add(inttostr(allele1)+' '+inttostr(allele2)+' '+inttostr(allele3)+' '+inttostr(allele4)); {decision tree to categorise the double pcr reads into the 8 categories} if (pcr1_sum + pcr2_sum) <= 0 then {data_cat[9]:=datacat[9]+1 } else if ((pcr1_sum = 0) or (pcr2_sum = 0)) {counts any double dropout (DD), eg 0012 into category 8} then data_cat[8] := data_cat[8] + 1 else if (pcr1_sum = pcr2_sum) and (allele1 = allele2) and (allele3 = allele4) {counts apparent double homozygotes, eg 1111, into category 1} then data_cat[1] := data_cat[1] + 1 else if (pcr1_sum = pcr2_sum) and (allele1 <> allele2) and ((allele1 = allele3) or (allele1 = allele4)) {counts true heterozygotes, eg 1212, into category 2} then data_cat[2] := data_cat[2] + 1 else if (pcr1_sum <> pcr2_sum) {counts 1222-type errors into category 3} and ((((allele1 = allele2) and (allele3 <> allele4)) and ((allele1 = allele3) or (allele1 = allele4))) or (((allele1 <> allele2) and (allele3 = allele4)) and ((allele3 = allele1) or (allele3 = allele2)))) then data_cat[3] := data_cat[3] + 1 else if (pcr1_sum <> pcr2_sum) and (allele1 = allele2) and (allele3=allele4) and (allele1 <> allele3) {counts 1122-type errors into category 4} then data_cat[4] := data_cat[4] + 1 else if (pcr1_sum <> pcr2_sum) and (allele1 <> allele2) and (allele3 <> allele4) {counts 1814-type errors into category 5} and ((allele1 = allele3) or (allele1 = allele4) or (allele2 = allele3) or (allele2 = allele4)) then data_cat[5] := data_cat[5] + 1 else if ((allele1<>allele3) and (allele1<>allele4) and (allele2<>allele3) and (allele2<>allele4)) {counts 1124-type errors into category 6} and (((allele1 <> allele2) and (allele3 = allele4)) or ((allele1 = allele2) and (allele3 <> allele4))) and ((allele1 <> allele3) and (allele1 <> allele4) and (allele2 <> allele3) and (allele2 <> allele4)) then data_cat[6] := data_cat[6] + 1 else if (allele1 <> allele2) and (allele2 <> allele3) and (allele3 <> allele4) and (allele2 <> allele3) and (allele2 <> allele4) and (allele3 <> allele4) then data_cat[7] := data_cat[7] + 1; until allele1+allele2+allele3+allele4<0; tot_data_obs := data_cat[1]+data_cat[2]+data_cat[3]+data_cat[4]+data_cat[5]+data_cat[6]+data_cat[7]; if see_data_cat = 1 then begin output1.Lines.Add('Double genotype categories for locus '+inttostr(h)); for k := 1 to 8 do output1.Lines.Add('Data number in category ' + inttostr(k) + ': '+inttostr(data_cat[k])); output1.Lines.Add(' ----------------------'); output1.Lines.Add('Sum of observable categories 1-7: '+inttostr(tot_data_obs)); output1.Lines.Add(''); end; sim_or_real:=1; He:=data_He[h]; if using_He_limits=1 then He:=He-1.96*data_He_stddev[which_locus]; if using_He_limits=2 then He:=He+1.96*data_He_stddev[which_locus]; {locus_name:=h;} maxlikelihood; summary.lines.add(''); summary.lines.add('Locus '+inttostr(h)+': Found maximum likelihood error rates using '+INTTOSTR(noof_steps)+' search steps.'); summary.lines.add('Per-allele rates: E1 (allelic dropout) = ' + floattostrf(error1, ffnumber, 8, 6) + ', E2 (false alleles) = ' + floattostrf(error2, ffnumber, 8, 6)); summary.lines.add('Per-genotype rates: p (allelic dropout rate per heterozygote) = ' + floattostrf(est_drop_per_het, ffnumber, 8, 6) + ', f (false allele rate per genotype) = ' + floattostrf(est_false_per_pcr, ffnumber, 8, 6)); j := j+4; if single_locus = 1 then single_locus := single_locus + 1; until (h=noof_loci) {or (testarray[1,j+1] < 0)} or (single_locus >= 2) or (j>=100); end; procedure TtrialForm1.Exit1Click(Sender: TObject); begin Close; end; procedure TtrialForm1.Open1Click(Sender: TObject); begin OpenDialog1.Filter:='Text files (*.txt)|*.txt'; OpenDialog1.Execute; {should put in some kind of if-then procedure to prevent a crash if the user decides to cancel the dialogue box} AssignFile(infile1,OpenDialog1.FileName); label1.caption := ('Using repeat file: '+OpenDialog1.FileName); Reset(infile1); read_data; get_data; end; procedure TtrialForm1.getconfidenceClick(Sender: TObject); begin if (sim_or_real = 1) then begin single_locus := 1; if (noof_loci > 1) then which_locus := strtoint(inputbox('Which locus do you want CIs for?', 'From 1 to '+inttostr(noof_loci), inttostr(which_locus))) else which_locus := 1; if which_locus < 1 then which_locus := 1; if which_locus > noof_loci then which_locus := noof_loci; get_data; series1.clear; series2.clear; series48.clear; series49.clear; series28.clear; series29.clear; confidence_limits; summary.lines.add('E1 and E2 with CIs calculated using estimated He of '+ floattostr(He)); if data_He_stddev[which_locus]>0 then begin single_locus := 1; using_He_limits:=1; get_data; {summary.lines.add('Lower CI He after get_data = '+ floattostr(He)); } confidence_limits; summary.lines.add('E1 and E2 with CIs calculated using lower 95% confidence limit of He = '+ floattostr(He)); {summary.lines.add('Lower CI He after confidence_limits = '+ floattostr(He)); } single_locus := 1; using_He_limits:=2; get_data; confidence_limits; summary.lines.add('E1 and E2 with CIs calculated using upper 95% confidence limit of He = '+ floattostr(He)); using_He_limits:=0; end; end else begin maxlikelihood; series1.clear; series2.clear; series48.clear; series49.clear; series28.clear; series29.clear; confidence_limits; end; output1.Lines.Add(''); output1.Lines.Add('Confidence limits calculated. Click "CI Plot" tab.'); end; procedure TtrialForm1.confidence_limits; var E1_x, E2_y, angle_rad, step, max_lhood, spoke_lhood, old_spoke_lhood, likelihood_drop, likelihood_drop_step: real; angle_deg, angle_deg_inc: integer; ninety_plotted: boolean; res, remainder, divisor: word; upper_95CI_E2, upper_95CI_E1, lower_95CI_E2, lower_95CI_E1: real; begin angle_deg := 0; upper_95CI_E1:=0; lower_95CI_E1:=0; upper_95CI_E2:=0; lower_95CI_E2:=0; {step := max(min(error1,error2)/1000,0.00000001);} max_lhood := log_likelihood(error1, error2); if using_He_limits=0 then Series2.AddXY(error1,error2,'',clred); if using_He_limits=1 then Series48.AddXY(error1,error2,'',clblack); if using_He_limits=2 then Series49.AddXY(error1,error2,'',clblue); if sim_or_real = 0 then begin Series28.addXY(E1,E2,'',clblue); series29.addXY(sample_error[1]/(4*noof_samples), sample_error[2]/(4*noof_samples),'',clblue); He := sim_He; end; output1.Lines.Add(''); output1.Lines.Add('Calculating confidence intervals. Please wait.'); angle_deg_inc:=2; divisor:=90; { if min(error1,error2)<0.0000005 then begin angle_deg_inc:=15; divisor:=15; output1.Lines.Add('Warning! At least one error estimate is zero. CI calculation will be slow.'); end; } repeat step := max(min(error1,error2)/1000,0.0000001); angle_rad := PI * angle_deg / 180; E1_x := error1; E2_y := error2; ninety_plotted:=false; repeat old_spoke_lhood := log_likelihood(E1_x, E2_y); E1_x := E1_x+(step*sin(angle_rad)); E2_y := E2_y+(step*cos(angle_rad)); {if E1_x <= 0 then E1_x := 0.0000001; if E1_x >= 1 then E1_x := 1-0.0000001; if E2_y <= 0 then E2_y := 0.0000001; if E2_y >= 1 then E2_y := 1-0.0000001; } spoke_lhood := log_likelihood(E1_x, E2_y); likelihood_drop_step := old_spoke_lhood - spoke_lhood; if likelihood_drop_step > 0.005 then step:=step/10; {if likelihood_drop_step > 0.005 then output1.Lines.Add('Step/10 = '+floattostr(step)+' Ldrop='+floattostr(likelihood_drop_step));} if likelihood_drop_step < 0.001 then step:=step*10; {if likelihood_drop_step < 0.001 then output1.Lines.Add('Step*10 = '+floattostr(step)+' Ldrop='+floattostr(likelihood_drop_step));} likelihood_drop := max_lhood - spoke_lhood; if (likelihood_drop >= 2.3026) and (ninety_plotted=false) then begin {if angle_deg=45 then summary.Lines.Add('Upper 90% confidence limit: E1 ='+floattostrf(E1_x,ffnumber,8,6)+', E2 = '+floattostrf(E2_y,ffnumber,8,6)); if angle_deg=225 then summary.Lines.Add('Lower 90% confidence limit: E1 ='+floattostrf(E1_x,ffnumber,8,6)+', E2 = '+floattostrf(E2_y,ffnumber,8,6)); } if using_He_limits=0 then Series1.AddXY(E1_x,E2_y,'',clRed); if using_He_limits=1 then Series1.AddXY(E1_x,E2_y,'',clblack); if using_He_limits=2 then Series1.AddXY(E1_x,E2_y,'',clblue); ninety_plotted:=true; end; until (likelihood_drop >= 2.9957) or (E1_x<=0.000000001) or (E1_x>=1-0.000000001) or (E2_y<=0.000000001) or (E2_y>=1-0.000000001); if angle_deg=0 then upper_95CI_E2 := E2_y; if angle_deg=90 then upper_95CI_E1 := E1_x; if angle_deg=180 then lower_95CI_E2 := E2_y; if angle_deg=270 then lower_95CI_E1 := E1_x; if using_He_limits=0 then Series1.AddXY(E1_x,E2_y,'',clRed); if using_He_limits=1 then Series1.AddXY(E1_x,E2_y,'',clblack); if using_He_limits=2 then Series1.AddXY(E1_x,E2_y,'',clblue); {output1.Lines.add('Spoke lhood 95%CI= '+floattostr(spoke_lhood)+' Deg = '+inttostr(angle_deg)+' E1_x='+floattostr(E1_x)+' E2_y='+floattostr(E2_y)+' step='+floattostr(step)+' lhd drop='+floattostr(likelihood_drop)); } inc(angle_deg,angle_deg_inc); divmod(angle_deg,divisor,res,remainder); if remainder=0 then output1.Lines.Add(floattostrf(angle_deg*10/36,ffnumber,3,0)+'% completed'); until angle_deg = 360; summary.Lines.Add(''); summary.Lines.Add('E1 95% confidence interval: '+floattostrf(lower_95CI_E1,ffnumber,8,6)+', '+floattostrf(upper_95CI_E1,ffnumber,8,6)); summary.Lines.Add('E2 95% confidence interval: '+floattostrf(lower_95CI_E2,ffnumber,8,6)+', '+floattostrf(upper_95CI_E2,ffnumber,8,6)); if He=sim_He then He:=true_He; end; procedure TtrialForm1.OpenHfile1Click(Sender: TObject); begin OpenDialog1.Filter:='Text files (*.txt)|*.txt'; OpenDialog1.Execute; {should put in some kind of if-then procedure to prevent a crash if the user decides to cancel the dialogue box} AssignFile(infile2,OpenDialog1.FileName); Reset(infile2); label2.caption := ('Using He file: '+OpenDialog1.FileName); read_He_data; analyse_He_data; end; procedure TtrialForm1.read_He_data; var i, j:integer; begin for i:=1 to 4000 do begin for j:=1 to 50 do begin He_array[i,j]:=0; end; end; readln(infile2); {lets the input file have a first line of any text} readln(infile2,noof_He_loci); i:=0; while not eof(infile2) do begin inc(i); j:=0; while not eoln(infile2) do begin inc(j); read(infile2,He_array[i,j]); end; readln(infile2); end; HeOutput.Lines.Add('Loaded He file of '+inttostr(noof_He_loci)+' loci.'); end; procedure TtrialForm1.analyse_He_data; var i,j,bp,locus_name: integer; count_allele:array[1..25,0..999] of integer; freq_alleles:array[1..25,0..999] of real; total_alleles,noof_alleles: integer_vector; He_variance, He_stddev, est_He: real_vector; sumpisq,sumpicu: real; begin for locus_name:=1 to 25 do begin for bp:=0 to 999 do begin count_allele[locus_name,bp]:=0; total_alleles[locus_name]:=0; noof_alleles[locus_name]:=0; end; end; locus_name:=0; repeat inc(locus_name); j:=locus_name*2-1; i:=0; sumpisq:=0; sumpicu:=0; repeat inc(i); for bp:=0 to 999 do begin if He_array[i,j]=bp then inc(count_allele[locus_name,bp]); if (He_array[i,j]=bp) and (bp > 0) then inc(total_alleles[locus_name]); if He_array[i,j+1]=bp then inc(count_allele[locus_name,bp]); if (He_array[i,j+1]=bp) and (bp > 0) then inc(total_alleles[locus_name]); end; until He_array[i,j]<0; HeOutput.Lines.Add(''); HeOutput.Lines.Add('Allele counts for locus '+inttostr(locus_name)); for bp := 1 to 999 do begin if count_allele[locus_name,bp]>0 then begin inc(noof_alleles[locus_name]); freq_alleles[locus_name,bp]:=count_allele[locus_name,bp]/total_alleles[locus_name]; sumpisq:=sumpisq+sqr(freq_alleles[locus_name,bp]); sumpicu:=sumpicu+power(freq_alleles[locus_name,bp],3); HeOutput.Lines.Add('Allele '+inttostr(bp)+' Count: '+inttostr(count_allele[locus_name,bp])+' Frequency: '+floattostrf(freq_alleles[locus_name,bp],ffnumber,8,4)); end; end; HeOutput.lines.Add('Total number of alleles counted: '+inttostr(total_alleles[locus_name])); HeOutput.lines.Add('There are '+inttostr(noof_alleles[locus_name])+' alleles at locus '+inttostr(locus_name)+'.'); {The next 2 lines calculate He and its variance (unbiased). Variance calc according to Nei (1978) Genetics 89:583-590 equation 10.} est_He[locus_name]:=total_alleles[locus_name]*(1-sumpisq)/(total_alleles[locus_name]-1); He_variance[locus_name]:=(1/((total_alleles[locus_name]/2)*(total_alleles[locus_name]-1)))*(sumpisq-power(sumpisq,2)+2*(total_alleles[locus_name]-2)*(sumpicu-power(sumpisq,2))); {the next two lines give alternative (biased) estimate of He and var, according to Nei & Roychoudhury (1974) Genetics 76:379} {est_He[locus_name]:=1-sumpisq; He_variance[locus_name]:=(2*(total_alleles[locus_name]-1)/power(total_alleles[locus_name],3))*((3-2*total_alleles[locus_name])*power(sumpisq,2)+2*(total_alleles[locus_name]-2)*sumpicu+sumpisq); } He_stddev[locus_name]:=sqrt(He_variance[locus_name]); HeOutput.lines.Add('He = '+floattostrf(est_He[locus_name],ffnumber,8,4)); HeOutput.lines.Add('Var of He = '+floattostrf(He_variance[locus_name],ffnumber,8,8)); HeOutput.lines.Add('SE of He = '+floattostrf(He_stddev[locus_name],ffnumber,8,8)); HeOutput.lines.Add('95% CI of He = '+floattostrf(est_He[locus_name]-1.96*He_stddev[locus_name],ffnumber,8,4)+', '+floattostrf(est_He[locus_name]+1.96*He_stddev[locus_name],ffnumber,8,4)); HeOutput.lines.Add(''); until (locus_name=noof_He_loci); HeOutput.lines.Add(''); HeOutput.lines.Add('He:'); for locus_name:=1 to noof_He_loci do begin HeOutput.lines.add(floattostrf(est_He[locus_name],ffnumber,8,8)); end; HeOutput.lines.Add(''); HeOutput.lines.Add('Standard error:'); for locus_name:=1 to noof_He_loci do begin HeOutput.lines.add(floattostrf(He_stddev[locus_name],ffnumber,8,8)); end; HeOutput.lines.Add(''); HeOutput.lines.Add('Unbiased He, variance and standard error estimated according to Nei (1978) Genetics 89:583-590.'); HeOutput.lines.Add(''); end; end.