#ifndef frb_match_H #define frb_match_H /* Monotonic pairings between sequences by dynamic programming. */ /* Last edited on 2005-01-16 14:44:58 by stolfi */ /* Copyright © 2005 Universidade Estadual de Campinas. See note at end of file. */ #include #include typedef i2_vec_t frb_match_t; /* A {frb_match_t} is a `monotone matching' between two arbitrary sequences {a[0..NA-1]} and {b[0..NB-1]}, defined as a list of pairs {mt[0..m-1] == (r[0],s[0]),.. (r[m-1],s[m-1])} such that | r[0] >= 0, s[0] >= 0; | | r[m-1] <= NA-1, s[m-1] <= NB-1; | | r[k+1] == r[k] ) || ( r[k+1] == r[k]+1, | s[k+1] == s[k] ) || ( s[k+1] == s[k]+1, | | mt[k+1] != mt[k] for all {k}. By definition, element {a[r[k]] == a[mt[k][0]]} is paired with sample {b[s[k]] == b[mt[k][1]]}, for all {k}. We say that a pair {(r1,s1)} `precedes' another pair {(r2,s2)} if {r1 <= r2} and {s1 <= s2}, and at least one of these inequalities is strict. In that case we also say that {(r2,s2)} `follows' {(r1,s1)}. Note that, in any monotone matching {mt}, {mt[i]} precedes {mt[j]} iff {i < j}. Two consecutive pairs {mt[k]==(r[k],s[k])} and {mt[k+1]==(r[k+1],s[k+1])} are said to be a `step' of the match. The step is said to be `full' if both {r} and {s} increase by 1, and `half' if only one of them increases. A `perfect matching' is one that uses only full steps. The matching is said to be `complete' if {mt[0] == (0,0)} and {mt[m-1] == (NA-1,NB-1)}; and `incomplete' otherwise. */ #define frb_match_new i2_vec_new #define frb_match_expand i2_vec_expand /* MINIMUM-COST MATCHINGS */ /* The procedures in this section take as argument an arbitrary `step cost function' {step_cost_fn} (which may return negative values). The `total cost' of a matching {mt} is the sum of {step_cost_fn(s[k])} over all steps {S[k] == mt[k]->mt[k+1]} of {mt}. The `step count' of a match is defined as the number of half-steps plus twice the number of full steps. It is also the number of samples matched in the two ckains, minus two; thus any complete matching has step count {NA+NB-2}. The optimization procedures also use internally a {CostMatrix} {rCost} with at least {NA} rowns and {NB} columns, which may be preallocated and given by the client. The procedures will (re)allocate said matrix if it is NULL or too small. */ typedef double frb_match_step_function_t ( i2_t *p1, i2_t *p2 ); typedef double_vec_t frb_match_cost_matrix_t; /* Work array used by DP algorithm */ void frb_match_find_best ( int NA, int NB, frb_match_step_function_t step_cost_fn, frb_match_t *mt, /* (OUT) */ double *totCost, /* (OUT) */ frb_match_cost_matrix_t *rCost /* (WORK) */ ); /* Returns in {mt} a complete monotone matching between two strings with {NA} and {NB} elements, with mimimum total cost (which is returned in {totCost}. */ /* UTILITY FUNCTIONS */ frb_match_t *frb_match_trim ( frb_match_t *mt, int start, int count ); /* Returns the segment {mt[start..start+count-1]} of matching {mt}. */ frb_match_t *frb_match_adjust ( frb_match_t *mt, frb_segment_t sOld[], frb_segment_t sNew[] ); /* Given a matching between segments {sOld[0]} and {sOld[1]}, returns the equivalent matching between {sNew[0]} and {sNew[1]}. This entails mapping the relative sample indices in {mt} so that they refer to the same samples, and possibly trimming pairs that fall outside the new segments. The result is NULL if none of the old pairs are contained in the new segments, or the intersection of the restriction of {mt} to the new segments is not a single, continuous matching. */ frb_match_t *frb_match_remove_dangling_ends ( frb_match_t *mt ); /* Returns a new matching obtained by removing the ``dangling ends'' of {mt}, i.e. all initial and final half-steps of {mt} where one of the ``legs'' remains stuck at the initial or final point of the corresponding string. */ frb_match_t frb_match_most_perfect ( int n0, int n1 ); void frb_match_domost_perfect ( int n0, int n1, frb_match_t *mt ); /* Generates a maximally perfect matching between two segments with {n0} and {n1} samples, respectively. The corrresponding path is a Bresenham line between {(0,0)} and {(n0,n1)}. {frb_match_domost_perfect} is like {frb_match_most_perfect} but stores the matching in the client-provided array {mt}, which should have {max(n0,n1)} elements exactly. */ typedef bool frb_match_step_predicate_t ( i2_t *p1, i2_t *p2 ); typedef double frb_match_step_measure_t ( i2_t *p1, i2_t *p2 ); double frb_match_length ( frb_match_t *mt, frb_match_step_measure_t lengthFn ); /* Returns the sum of of {lengthFn(S)} for all steps of the matching {mt}. */ double frb_match_sum ( frb_match_t *mt, frb_match_step_function_t stepFn ); /* Returns the sum of {stepFn(S)} over the matching {mt}. */ double frb_match_integral ( frb_match_t *mt, frb_match_step_measure_t lengthFn, frb_match_step_function_t stepFn ); /* Returns the integral of {stepFn(S)} over the matching {mt}, with respect to step length as returned by {lengthFn(S)}. */ int frb_match_matched_step_count ( frb_match_t *mt, frb_match_step_predicate_t isMatched ); /* Returns the step count of {mt} (number of half-steps plus twice the number of full steps), considering only steps {S} for which {isMatched(S)} is TRUE. */ double frb_match_matched_length ( frb_match_t *mt, frb_match_step_measure_t lengthFn, frb_match_step_predicate_t isMatched ); /* Returns the sum of {lengthFn(S)} for all steps {S} in {mt} for which {isMatched(S)} is true. */ /* PACKED MATCHINGS */ typedef struct frb_match_packed_t { /* A compact description of a matching */ int np; /* Number of pairs (> 0). */ i2_t ini; /* Initial pair */ i2_t fin; /* Final pair */ char *steps; /* Encoded steps */ } frb_match_packed_t; /* Code for encoded steps '/' advance [0], '\\' advance [1], '|' both. */ frb_match_packed_t *frb_match_pack ( frb_match_t *mt ); /* Returns a packed encoding of {mt}. */ frb_match_packed_t *frb_match_pack_perfect ( int nSamples ); /* Returns a packed perfect matching between two segments, both with {nSamples} segments. */ frb_match_t frb_match_unpack ( frb_match_packed_t *pm ); void frb_match_do_unpack ( frb_match_packed_t *pm, frb_match_t *mt ); /* Expands a packed representation into an explicit matching. {frb_match_do_unpack} is like {frb_match_unpack} but returns the result in the client-provided array {mt}. */ void frb_match_write_packed ( FILE *wr, frb_match_packed_t *pm ); /* Writes the matching {pm} in compact form to writer {wr}. */ frb_match_packed_t *frb_match_read_packed ( FILE *rd ); /* Reads from {rd} a packed matching, in the format written by {Write}. */ frb_match_packed_t *frb_match_adjust_packed ( frb_match_packed_t *rpm, frb_segment_t oldSeg[], frb_segment_t newSeg[] ); /* Given a packed matching {rpm^} between two segments {oldSeg[0]} and {oldSeg[1]}, returns a new packed matching that describes the same pairing between the segments {newSeg[0]} and {newSeg[1]}. If the new segments are not contained in the original ones, the matching may be trimmed. The result is NULL if {rpm} is NULL or if the intersection of the new segments and the old pairings is not a single interval. */ void frb_debug_int_pair ( char *title, int r, int s ); /* Prints {title} followed by the inteers {r} and {s}. */ #endif /* COPYRIGHT AND AUTHORSHIP NOTICE Copyright © 2005 Universidade Estadual de Campinas (UNICAMP). Created by Helena C. G. Leitão and Jorge Stolfi in 1995--2005. This source file can be freely distributed, used, and modified, provided that this copyright and authorship notice is preserved in all copies, and that any modified versions of this file are clearly marked as such. This software has NO WARRANTY of correctness or applicability for any purpose. Neither the authors nor their employers shall be held responsible for any losses or damages that may result from its use. END OF NOTICE */