/* Generate initial candidates for multiscale pairing */ /* Last edited on 2015-01-20 16:46:23 by stolfilocal */ #include #include #include #include #include #include #include #include #include #include #include CONST ExpectedMatchesPerChain == 6; TYPE MismatchOptions == pz_mismatch.Options; Options = struct ??? { char *chainDir; /* Directory where to find chain data. */ char *chainPrefix; /* Invariant chain file name prefix. */ unsigned nChains; /* Number of chains. */ char *output; /* Output candidate file name. */ unsigned band; /* Filtering "band" (wavelength parameter). */ double step; /* Sampling step. */ char *excludeSegs; /* Filename of segments to exclude. */ MismatchOptions mp; /* Mismatch function parameters. */ double minLength; /* Min length of matched segments. */ double blurFactor; /* Corner broadening factor. */ unsigned maxChainCands; /* Max candidates per chain. */ unsigned maxPairCands; /* Max candidates per chain pair. */ unsigned printCands; /* Max candidates to print. */ bool_t verbose; /* TRUE to report candidates as they are found */ } ???; TYPE Segments == pz_segment_vec_t; Chains == ARRAY OF pz_symbol_chain_read_data; VTable == double_vec_t; int main(int argc, char **argv ) { { /* with */ ??? o = pz_get_options(int argc, char **argv); ??? band = o.band; ??? lambda = ((double)band); ??? minCandLength = EnsurePositive(o.minLength - 2.0e0 * o.blurFactor * lambda); ??? sel = pz_proc.SelectAll(o.nChains)^; chAllData = pz_symbol_chain_read_all( o.chainPrefix, band, sel := sel, extension := ".cvc"; dir := o.chainDir, header_only := FALSE ); ??? excludeSegs = ReadExcludedSegsFile(o.excludeSegs)^; ??? dch = pz_symbol_make_decode_table(chAllData.sigmaMin)^; ??? ech = pz_symbol_make_error_var_table(chAllData.sigmaMin)^; cand = ComputeInitialCandidates( chAllData.chData^; dch := dch; ech := ech; mp := o.mp; excludeSegs := excludeSegs; step := o.step; minCandLength := minCandLength; maxChainCands := o.maxChainCands; maxPairCands := o.maxPairCands; verbose := o.verbose )^; /* do */ affirm(chAllData.sigmaMin == chAllData.sigmaMax ); { /* with */ ??? wr = open_write(o.output & ".can", TRUE); /* do */ pz_candidate_write(wr, CandComment(o), cand, lambda); }; PrintCandidates(cand, band, o.printCands);; }; } /* Main */ double EnsurePositive( double minCandLength ) { if (( minCandLength < 0.0e0 )){ fprintf(stderr, "warning: \"-minLength\" too small for this scale\n"); fflush(stderr);; }; return max(0.0e0, minCandLength) } /* EnsurePositive */ Options pz_get_options(int argc, char **argv ) VAR o: Options; { argparser_t *pp = argparser_new(stderr, argc, argv); argparser_set_help(pp, PROG_NAME " version " PROG_VERS ", usage:\n" PROG_HELP); argparser_set_info(pp, PROG_INFO); argparser_process_help_info_options(pp); { /* with */ /* do */ TRY argparser_get_keyword(pp, "-chainPrefix"); o.chainPrefix = argparser_get_next(pp); if (( argparser_keyword_present(pp, "-chainDir") )){ o.chainDir = argparser_get_next(pp) }else{ o.chainDir = "."; }; argparser_get_keyword(pp, "-nChains"); o.nChains = argparser_get_next_int(pp); argparser_get_keyword(pp, "-band"); o.band = argparser_get_next_int(pp); argparser_get_keyword(pp, "-step"); o.step = argparser_get_next_double(pp, 0.0e0); argparser_get_keyword(pp, "-output"); o.output = argparser_get_next(pp); o.mp = pz_mismatch.ParseOptions(pp); argparser_get_keyword(pp, "-minLength"); o.minLength = argparser_get_next_double(pp, 0.0e0); argparser_get_keyword(pp, "-blurFactor"); o.blurFactor = argparser_get_next_double(pp, 0.0e0); if (( argparser_keyword_present(pp, "-excludeSegs") )){ o.excludeSegs = argparser_get_next(pp) }else{ o.excludeSegs = ""; }; if (( argparser_keyword_present(pp, "-maxChainCands") )){ o.maxChainCands = argparser_get_next_int(pp, 1) }else{ o.maxChainCands = (unsigned.ne - 1); }; if (( argparser_keyword_present(pp, "-maxPairCands") )){ o.maxPairCands = argparser_get_next_int(pp, 1, o.maxChainCands) }else{ o.maxPairCands = (unsigned.ne - 1); }; if (( argparser_keyword_present(pp, "-printCands") )){ o.printCands = argparser_get_next_int(pp, 1) }else{ o.printCands = 1000; }; o.verbose = argparser_keyword_present(pp, "-verbose"); argparser_finish(pp); EXCEPT | ParseParams.Error ==> fprintf(stderr, "Usage: pz_get_initial_cands \\\n"); fprintf(stderr, " [ -chainDir DIR ] -chainPrefix NAME \\\n"); fprintf(stderr, " -nChains NUMBER \\\n"); fprintf(stderr, " -band NUMBER -step NUMBER \\\n"); fprintf(stderr, " -output NAME \\\n"); fprintf(stderr, " [ -excludeSegs FILENAME ] \\\n"); fprintf(stderr, " -minLength NUMBER -blurFactor NUMBER \\\n"); fprintf(stderr, " %s \\\n", pz_mismatch.OptionsHelp ); fprintf(stderr, " [ -maxChainCands NUMBER ] \\\n"); fprintf(stderr, " [ -maxPairCands NUMBER ] \\\n"); fprintf(stderr, " [ -printCands NUMBER ] \\\n"); fprintf(stderr, " [ -verbose ]\n"); Process.Exit(1);; };; }; return o } /* GetOptions */ pz_candidate.List *ComputeInitialCandidates( Chains *chain, VTable *dch /* Curvature decoding table. */ VTable *ech, /* Quantization noise variances. */ MismatchOptions mp, /* Mismatch formula parameters. */ Segments *excludeSegs, /* List of segments to exclude */ double step, /* Sampling step. */ double minCandLength, /* Min length of candidate */ unsigned maxChainCands, /* Max candidates per chain */ unsigned maxPairCands, /* Max candidates per chain pair */ bool_t verbose, /* Report candidates found */ ) VAR c : REF pz_candidate.List; nCands : unsigned = 0; { { /* with */ ??? nChains = (chain.ne); /* do */ c = NEW(REF pz_candidate.List, nChains * ExpectedMatchesPerChain); for (ka = 0; ka <= nChains-1 ; ka++){ if (( chain[ka].c != NULL )){ { /* with */ ??? a = chain[ka].c^; ??? aMinSteps = FLOOR(minCandLength / step); /* do */ for (kb = ka+1; kb <= nChains-1 ; kb++){ if (( chain[kb].c != NULL )){ { /* with */ ??? b = chain[kb].c^; ??? bMinSteps = FLOOR(minCandLength / step); ??? minSteps = MIN(aMinSteps, bMinSteps); /* do */ fprintf(stderr, "."); ProcessChainPair( a, ka, b, kb, dch = dch, ech = ech, step = step, minSteps = minSteps, mp = mp, excludeSegs = excludeSegs, verbose = verbose, maxPairCands = maxPairCands, c = c, nCands = nCands ); };; };; };; };; }; fprintf(stderr, " %s\n", Fmt.Int(nCands) );; }; fprintf(stderr, Fmt.Int(nCands) & " candidates before chain pruning\n"); pz_candidate_sort(SUBARRAY(c^, 0, nCands), pz_candidate_mismatch_better); pz_candidate_prune_cands_per_chain(c^, nCands, maxChainCands); fprintf(stderr, Fmt.Int(nCands) & " candidates after chain pruning\n"); pz_candidate_sort(SUBARRAY(c^, 0, nCands), pz_candidate_pair_mismatch_better); { /* with */ ??? ct = NEW(REF pz_candidate.List, nCands); /* do */ ct^ = SUBARRAY(c^, 0, nCands); return ct;; };; }; } /* ComputeInitialCandidates */ void ProcessChainPair( pz_symbol_chain_t *a, unsigned ka, pz_symbol_chain_t *b, unsigned kb, VTable *dch, /* Curvature decoding table */ VTable *ech, /* Quantization noise variances. */ double step, /* Nominal step length */ unsigned minSteps, /* Minimum number of steps in candidate. */ MismatchOptions mp, /* Mismatch formula parameters. */ Segments *excludeSegs, /* Segments to exclude. */ unsigned maxPairCands, /* Maximum candidates to create for this pair. */ bool_t verbose, /* Report candidates found */ /*IO*/ REF *pz_candidate.List c, unsigned *nCands ) /* Consider the set of pairs "(ia,ib)" where "ia" is an index in "a" and "ib" is an index in "b". Since the chains are circular, this set is a toroidal integer grid with periods "ma==(a.ne)" and "mb == (b.ne)", respectively. In this program, we only look for candidates that are defined by `perfect' pairings, i.e. pairings without half-steps. Such a pairing is a segment of some diagonal line of the "(ia,ib)" grid. Therefore, we enumerate the diagonals of the toroidal grid, and search for candidates within each diagonal in turn. The number of diagonals is "numDiagonal == GCD(ma,mb)", and each diagonal is a circular sequence of "diagSize == (ma*mb)/nDiags". We number the diagonals from "[0..nDiags-1]" and their elements from [0..diagSize-1]", as follows: element index "r" of diagonal number "t" is the pair "(ia,ib)" where "ia == r MOD ma" and "ib == (t-r) MOD mb". Within each diagonal "t", we compute a vector "d2" such that "d2[r]" is the distance squared between "a[ia]" and "b[ib]", where "ia,ib" are the corresponding chain indices. These distances are clipped TO "mp.maxDistSqr" We also compute a boolean vector "ok", such that "ok[r]" tells whether there is a sequence of "minSteps" consecutive steps beginning with element "r" that could be part of an acceptable candidate. Specifically, "ok[r] == TRUE" iff the average squared differences "d2[]" over the next "minSteps" steps starting with pair "r" is less than "mp.critDistSqr". Call these sequences `minimal candidates.' At the same time, we exclude from the "ok" vector any entries "ok[r]" such that the corresponding pair of samples, and the next "minSteps" samples, overlap a segment listed in "excludeSegs". The candidates are then found by identifying the maximal substrings of consecutive "TRUE"s in the "ok" vector, and checking whether their mismatch, plus "mp.bias", is negative. Note that the average of "d2" is negative within each minimal candidate, hence by merging overlapping minimal candidates we only make the mismatch smaller. */ VAR d2Sum: double; nCandsIni: unsigned = nCands; nCandsPair: unsigned; { { /* with */ ??? ma = (a.ne); ??? mb = (b.ne); ??? nDiags = GCD(ma, mb); ??? diagSize = (ma * mb) DIV nDiags; ??? nOld = nCands + 0; /* Work vectors: */ ??? d2 = double_vec_new(diagSize)^; /* Squared pair distances. */ ??? d2s = double_vec_new(diagSize)^; /* Accumulated "d2[r]". */ ??? ok = bool_vec_new(diagSize)^; ??? exa = bool_vec_new(ma)^; /* Excluded parts of "a". */ ??? exb = bool_vec_new(mb)^; /* Excluded parts of "b". */ /* do */ if (( verbose )){ fprintf(stderr, "chains: " & Fmt.Int(ka) & "[" & Fmt.Int(ma) & "]"); fprintf(stderr, " × "); fprintf(stderr, Fmt.Int(kb) & "[" & Fmt.Int(mb) & "]"); fprintf(stderr, " num diags: " & Fmt.Int(nDiags)); fprintf(stderr, " ns diag: " & Fmt.Int(diagSize)); fprintf(stderr, " minSteps: " & Fmt.Int(minSteps));; }; /* Mark excluded parts of the two chains: */ for (i = 0; i < (exa.ne ) ; i++){ exa[i] = FALSE ;}; ExcludeSamples(ka, excludeSegs, exa, extraL = minSteps, extraR = 0); for (i = 0; i < (exb.ne ) ; i++){ exb[i] = FALSE ;}; ExcludeSamples(kb, excludeSegs, exb, extraL = 0, extraR = minSteps); /* Scan diagonals of the grid: */ for (t = 0; t <= nDiags-1 ; t++){ /* Compute pairwise squared distances "d2[r]". Also accumulates squared distances, so that "d2s[r] == SUM { d2[k] : k IN [0..r] }". */ d2Sum = 0.0e0; for (r = 0; r <= diagSize-1 ; r++){ { /* with */ ??? ia = r MOD ma; ??? ib = (t-r) MOD mb; ??? dab2 = pz_symbol_dist_sqr(a[ia], b[ib]; complement := TRUE, decode := dch, errorVar := ech ); /* do */ d2[r] = min(mp.maxDistSqr, dab2); d2Sum = d2Sum + d2[r]; d2s[r] = d2Sum;; };; }; /* Set "ok[r]" true if the corresponding pair "(ia,ib)" is the beginning of a run of "minSteps" steps whose average squared distance is less than "mp.critDistSqr". */ for (r = 0; r <= diagSize-1 ; r++){ { /* with */ ??? ia = r MOD ma; ??? ib = (t - r) MOD mb; ??? rLo = r; ??? rmLo = rLo MOD diagSize; ??? qLo = ((double)rLo DIV diagSize); ??? s2Lo = d2s[rmLo] + qLo*d2Sum; ??? rHi = r + minSteps; ??? rmHi = rHi MOD diagSize; ??? qHi = ((double)rHi DIV diagSize); ??? s2Hi = d2s[rmHi] + qHi*d2Sum; /* Integral of "d2" by trapezoidal rule: */ ??? d2Intg = (s2Hi - 0.5e0*d2[rmHi]) - (s2Lo - 0.5e0 * d2[rmLo]); ??? nSteps = ((double)rHi - rLo - 1); /* do */ affirm(rHi > rLo ); ok[r] = (d2Intg < nSteps * mp.critDistSqr) ) ANDAND ( (NOT exa[ia]) ) ANDAND ( (NOT exb[ib]); };; }; /* Now collect runs of TRUEs in "ok": */ BreakDiagonal( a, ka, b, kb, d2, ok, c, nCands, tDiag = t, step = step, minSteps = minSteps, mp = mp );; }; nCandsPair = nCands - nCandsIni; { /* with */ ??? cx = SUBARRAY(c^, nCandsIni, nCandsPair); /* do */ pz_candidate_sort(cx, pz_candidate_pair_mismatch_better); pz_candidate_prune_cands_per_pair(cx, nCandsPair, maxPairCands); nCands = nCandsIni + nCandsPair; }; if (( verbose )){ fprintf(stderr, " nCands: " & Fmt.Int(nCands)); fprintf(stderr, " (+%s)\n", Fmt.Int(nCands - nOld) );; };; }; } /* ProcessChainPair */ void BreakDiagonal( pz_symbol_chain_t *a, unsigned ka, pz_symbol_chain_t *b, unsigned kb, double_vec_t *d2, bool_vec_t *ok, REF *pz_candidate.List c, unsigned *nCands, unsigned tDiag /* Index of diagonal */ double step, unsigned minSteps, /* Minimum number of steps in candidate. */ MismatchOptions mp, /* Mismatch formula parameters. */ ) VAR k: unsigned; count, kStart, kIni: unsigned; intgDistSqr: double; nMatched: double; h: double; inx: unsigned; { { /* with */ ??? ma = (a.ne); ??? mb = (b.ne); ??? diagSize = (d2.ne); ??? maxSteps = MIN(ma,mb) - 1; /* do */ affirm(diagSize > 1 ); affirm(diagSize MOD ma == 0 ); affirm(diagSize MOD mb == 0 ); affirm(maxSteps < diagSize ); /* Look for a TRUE element "ok[kStart]" right after a FALSE element: */ k = 0; while ( ok[k MOD diagSize] == TRUE ) ANDAND ( k < diagSize ){ k++ ;}; if (( k == diagSize )){ /* All TRUE */ kStart = 0 }else{ inx = 0; while ( ok[k MOD diagSize] == FALSE ) ANDAND ( inx < diagSize ){ k++; inx++ ;}; if (( inx == diagSize )){ /* All FALSE */ return ;}; kStart = k MOD diagSize;; }; /* Collect runs of TRUEs": */ k = kStart; REPEAT /* Here begins a new run */ kIni = k; count = 0; REPEAT k++; count++; UNTIL ok[k MOD diagSize] == FALSE ) || ( count == diagSize; /* Compute end of candidate: */ { /* with */ ??? nStepsRaw = count - 1 + minSteps; ??? nSteps = MIN(nStepsRaw, maxSteps); ??? excess = nStepsRaw - nSteps; /* do */ kIni = kIni + excess DIV 2; if (( nSteps >= minSteps )){ /* Make room for a new candidate: */ if (( nCands >= (c^.ne) )){ { /* with */ ??? ct = NEW(REF pz_candidate.List, nCands * 2); /* do */ SUBARRAY(ct^, 0, (c^.ne)) = c^; c = ct;; };; }; /* Save this run as a new candidate: */ { /* with */ sa = pz_segment_t{ cvx := ka; tot := ma; ini := kIni MOD ma; ns := nSteps + 1; rev := FALSE }; sb = pz_segment_t{ cvx := kb; tot := mb; ini := (tDiag - kIni - nSteps) MOD mb; ns := nSteps + 1; rev := TRUE }; ??? length = step * ((double)nSteps); /* do */ /* Compute matched length and root mean square distance: */ /* Trapezoidal integration formula: */ intgDistSqr = 0.0e0; nMatched = 0.0e0; for (p = 0; p <= nSteps ; p++){ { /* with */ ??? r = (kIni+p) MOD diagSize; /* do */ if (( p == 0 ) || ( p == nSteps )){ h = 0.5e0 }else{ h = 1.0e0 ;}; intgDistSqr = intgDistSqr + h * d2[r]; if (( d2[r] < mp.maxDistSqr )){ nMatched = nMatched + h ;}; }; }; intgDistSqr = step * intgDistSqr; c[nCands] = pz_candidate_t{ seg = pz_segment_pair{sa, sb}, mismatch = pz_mismatch.Eval( length = length, intgDistSqr = intgDistSqr, totAsymm = 0.0e0, o = mp ), length = length, matchedLength = step * nMatched, pm = NULL }; nCands++;; };; }; }; while ( ok[k MOD diagSize] == FALSE ){ k++ ;}; UNTIL (k MOD diagSize) == kStart; }; } /* BreakDiagonal */ void ExcludeSamples( unsigned cvx /* A chain number. */ Segments *excludeSegs, /* Segments to exclude. */ bool_vec_t *ex, /* OUT: samples to exclude from chain "cvx". */ unsigned extraL, unsigned extraR, /* Extra samples to exclude at each END. */ ) /* Sets "ex[i]" TRUE if and only if there is some excluded segment in "excludeSegs" that covers sample number "j" of chain "cvx", where "j - extraL <= i <= j + extraR". */ { for (is = 0; is < (excludeSegs.ne ) ; is++){ { /* with */ ??? s = excludeSegs[is]; /* do */ if (( cvx == s.cvx )){ affirm((ex.ne) == s.tot ); for (r = -extraL; r <= s.ns + extraR - 1 ; r++){ { /* with */ ??? i = (s.ini + r) MOD s.tot; /* do */ ex[i] = TRUE; }; }; }; }; } } /* ExcludeSamples */ int GCD( int a, int b ) VAR m : unsigned := 1; i : unsigned = 2; { while ( i <= min(a,b) ){ if (( a MOD i == 0 ) ANDAND ( b MOD i == 0 )){ a = a DIV i; b = b DIV i; m = m * i; }else{ i++;; };; }; return m; } /* GCD */ void PrintCandidates ( pz_candidate.List *cand, <*UNUSED*> band: unsigned, unsigned maxCands ) { if (( maxCands == 0 )){ return ;}; for (i = 0; i <= min(maxCands, (cand.ne))-1 ; i++){ { /* with */ ??? c = cand[i]; /* do */ fprintf(stderr, "cand[%s]\n", Fmt.Int(i) ); pz_candidate_print(stderr, c, pairing = FALSE); }; }; fflush(stderr); } /* PrintCandidates */ Segments *ReadExcludedSegsFile( char *segName ) { if (( Text.Empty(segName) )){ return NEW(REF Segments, 0) }else{ { /* with */ ??? fileName = txtcat(segName , ".seg"); /* do */ fprintf(stderr, fileName & "\n"); { /* with */ ??? rd = open_read(fileName, TRUE); ??? sData = pz_segment_read(rd); /* do */ return sData.s; }; }; } } /* ReadExcludedSegsFile */ char *CandComment( Options *o ) { return "PZGetInitialCandidates\n" & " chainDir " & o.chainDir & "\n" & " chainPrefix " & o.chainPrefix & "\n" & " nChains == " & Fmt.Int(o.nChains) & "\n" & " band == " & Fmt.Int(o.band) & "\n" & " maxDist == " & Fmt.LongReal(sqrt(o.mp.maxDistSqr)) & "\n" & " critDist == " & Fmt.LongReal(sqrt(o.mp.critDistSqr)) & "\n" & " excludeSegs == \"" & o.excludeSegs & "\"\n" & " minLength == " & Fmt.LongReal(o.minLength) & "\n" & " blurFactor == " & Fmt.LongReal(o.blurFactor) & "\n" & " maxChainCands == " & Fmt.Int(o.maxChainCands) & "\n" & " maxPairCands == " & Fmt.Int(o.maxPairCands) & "\n" } /* CandComment */ { /* Copyright © 2001 Universidade Estadual de Campinas (UNICAMP). Authors: Helena C. G. Leitão and Jorge Stolfi. This file can be freely distributed, used, and modified, provided that this copyright and authorship notice is preserved, and that any modified versions are clearly marked as such. This software has NO WARRANTY of correctness or applicability for any purpose. Neither the authors nor their employers chall be held responsible for any losses or damages that may result from its use. */