-
+ 8802BDF2CCDB11A2618988AB2791E89185379A02892642599CD00559C57C383AD85BFA5DAA5261508372143E5FF34054010E72C01D52063CDA88F4D3F97CCBBA
vtools/lib/diffseq.h
(0 . 0)(1 . 478)
115
116 /* The basic idea is to consider two vectors as similar if, when
117 transforming the first vector into the second vector through a
118 sequence of edits (inserts and deletes of one element each), this
119 sequence is short - or equivalently, if the ordered list of
120 elements that are untouched by these edits is long. For a good
121 introduction to the subject, read about the ``Levenshtein
122 distance'' in Wikipedia.
123
124 The basic algorithm is described in: ``An O(ND) Difference
125 Algorithm and its Variations'', Eugene W. Myers, Algorithmica
126 Vol. 1, 1986, pp. 251-266,
127 \url{http://dx.doi.org/10.1007/BF01840446}. See especially section
128 4.2, which describes the variation used below.
129
130 The basic algorithm was independently discovered as described in:
131 ``Algorithms for Approximate String Matching'', Esko Ukkonen,
132 Information and Control Vol. 64, 1985, pp. 100-118,
133 \url{http://dx.doi.org/10.1016/S0019-9958(85)80046-2}.
134
135 Unless the |find_minimal| flag is set, this code uses the
136 |TOO_EXPENSIVE| heuristic, by Paul Eggert, to limit the cost to
137 $O(N^1.5 \log N)$ at the price of producing suboptimal output for
138 large inputs with many differences. */
139
140 /* Before including this file, you need to define:
141 |ELEMENT| The element type of the vectors being compared.
142 |EQUAL| A two-argument macro that tests two elements for
143 equality.
144 |OFFSET| A signed integer type sufficient to hold the
145 difference between two indices. Usually
146 something like |ptrdiff_t|.
147 |EXTRA_CONTEXT_FIELDS| Declarations of fields for 'struct context'.
148 |NOTE_DELETE(ctxt, xoff)| Record the removal of the object xvec[xoff].
149 |NOTE_INSERT(ctxt, yoff)| Record the insertion of the object yvec[yoff].
150 |EARLY_ABORT(ctxt)| (Optional) A boolean expression that triggers an
151 early abort of the computation.
152 |USE_HEURISTIC| (Optional) Define if you want to support the
153 heuristic for large vectors.
154 It is also possible to use this file with abstract arrays. In this case,
155 xvec and yvec are not represented in memory. They only exist conceptually.
156 In this case, the list of defines above is amended as follows:
157 |ELEMENT| Undefined.
158 |EQUAL| Undefined.
159 |XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)|
160 A three-argument macro: References xvec[xoff] and
161 yvec[yoff] and tests these elements for equality.
162 Before including this file, you also need to include:
163 |#include <limits.h>
164 #include <stdbool.h>|
165 */
166
167 /* Maximum value of type |OFFSET|. */
168 #define OFFSET_MAX \
169 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
170
171 /* Default to no early abort. */
172 #ifndef EARLY_ABORT
173 # define EARLY_ABORT(ctxt) false
174 #endif
175
176 /* Use this to suppress gcc's ``...may be used before initialized''
177 warnings. Beware: The Code argument must not contain commas. */
178 #ifndef IF_LINT
179 # if defined GCC_LINT || defined lint
180 # define IF_LINT(Code) Code
181 # else
182 # define IF_LINT(Code) /* empty */
183 # endif
184 #endif
185
186 /* As above, but when Code must contain one comma. */
187 #ifndef IF_LINT2
188 # if defined GCC_LINT || defined lint
189 # define IF_LINT2(Code1, Code2) Code1, Code2
190 # else
191 # define IF_LINT2(Code1, Code2) /* empty */
192 # endif
193 #endif
194
195 /*
196 * Context of comparison operation.
197 */
198 struct context {
199 #ifdef ELEMENT
200 /* Vectors being compared. */
201 ELEMENT const *xvec;
202 ELEMENT const *yvec;
203 #endif
204
205 /* Extra fields. */
206 EXTRA_CONTEXT_FIELDS
207
208 /* Vector, indexed by diagonal, containing 1 + the X coordinate of
209 the point furthest along the given diagonal in the forward
210 search of the edit matrix. */
211 OFFSET *fdiag;
212
213 /* Vector, indexed by diagonal, containing the X coordinate of the
214 point furthest along the given diagonal in the backward search
215 of the edit matrix. */
216 OFFSET *bdiag;
217
218 #ifdef USE_HEURISTIC
219 /* This corresponds to the |diff --speed-large-files| flag. With
220 this heuristic, for vectors with a constant small density of
221 changes, the algorithm is linear in the vector size. */
222 bool heuristic;
223 #endif
224
225 /* Edit scripts longer than this are too expensive to compute. */
226 OFFSET too_expensive;
227
228 /* Snakes bigger than this are considered "big". */
229 #define SNAKE_LIMIT 20
230 };
231
232 struct partition {
233 /* Midpoints of this partition. */
234 OFFSET xmid;
235 OFFSET ymid;
236
237 /* True if low half will be analyzed minimally. */
238 bool lo_minimal;
239
240 /* Likewise for high half. */
241 bool hi_minimal;
242 };
243
244
245 /* Find the midpoint of the shortest edit script for a specified
246 portion of the two vectors.
247
248 Scan from the beginnings of the vectors, and simultaneously from
249 the ends, doing a breadth-first search through the space of
250 edit-sequence. When the two searches meet, we have found the
251 midpoint of the shortest edit sequence.
252
253 If |find_minimal| is true, find the minimal edit script regardless
254 of expense. Otherwise, if the search is too expensive, use
255 heuristics to stop the search and report a suboptimal answer.
256
257 Set |part->(xmid,ymid)| to the midpoint |(xmid,ymid)|. The
258 diagonal number |xmid - ymid| equals the number of inserted
259 elements minus the number of deleted elements (counting only
260 elements before the midpoint).
261
262 Set |part->lo_minimal| to true iff the minimal edit script for the
263 left half of the partition is known; similarly for
264 |part->hi_minimal|.
265
266 This function assumes that the first elements of the specified
267 portions of the two vectors do not match, and likewise that the
268 last elements do not match. The caller must trim matching elements
269 from the beginning and end of the portions it is going to specify.
270
271 If we return the "wrong" partitions, the worst this can do is cause
272 suboptimal diff output. It cannot cause incorrect diff output. */
273
274 static void
275 diag(OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
276 struct partition *part, struct context *ctxt) {
277 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
278 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
279 #ifdef ELEMENT
280 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
281 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
282 #define XREF_YREF_EQUAL(x, y) EQUAL (xv[x], yv[y])
283 #else
284 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
285 #endif
286 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
287 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
288 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
289 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
290 OFFSET fmin = fmid;
291 OFFSET fmax = fmid; /* Limits of top-down search. */
292 OFFSET bmin = bmid;
293 OFFSET bmax = bmid; /* Limits of bottom-up search. */
294 OFFSET c; /* Cost. */
295 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
296 diagonal with respect to the northwest. */
297
298 fd[fmid] = xoff;
299 bd[bmid] = xlim;
300
301 for (c = 1;; ++c) {
302 OFFSET d; /* Active diagonal. */
303 bool big_snake = false;
304
305 /* Extend the top-down search by an edit step in each diagonal. */
306 if (fmin > dmin)
307 fd[--fmin - 1] = -1;
308 else
309 ++fmin;
310 if (fmax < dmax)
311 fd[++fmax + 1] = -1;
312 else
313 --fmax;
314 for (d = fmax; d >= fmin; d -= 2) {
315 OFFSET x;
316 OFFSET y;
317 OFFSET tlo = fd[d - 1];
318 OFFSET thi = fd[d + 1];
319 OFFSET x0 = tlo < thi ? thi : tlo + 1;
320
321 for (x = x0, y = x0 - d;
322 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
323 x++, y++)
324 continue;
325 if (x - x0 > SNAKE_LIMIT)
326 big_snake = true;
327 fd[d] = x;
328 if (odd && bmin <= d && d <= bmax && bd[d] <= x) {
329 part->xmid = x;
330 part->ymid = y;
331 part->lo_minimal = part->hi_minimal = true;
332 return;
333 }
334 }
335
336 /* Similarly extend the bottom-up search. */
337 if (bmin > dmin)
338 bd[--bmin - 1] = OFFSET_MAX;
339 else
340 ++bmin;
341 if (bmax < dmax)
342 bd[++bmax + 1] = OFFSET_MAX;
343 else
344 --bmax;
345 for (d = bmax; d >= bmin; d -= 2) {
346 OFFSET x;
347 OFFSET y;
348 OFFSET tlo = bd[d - 1];
349 OFFSET thi = bd[d + 1];
350 OFFSET x0 = tlo < thi ? tlo : thi - 1;
351
352 for (x = x0, y = x0 - d;
353 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
354 x--, y--)
355 continue;
356 if (x0 - x > SNAKE_LIMIT)
357 big_snake = true;
358 bd[d] = x;
359 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) {
360 part->xmid = x;
361 part->ymid = y;
362 part->lo_minimal = part->hi_minimal = true;
363 return;
364 }
365 }
366
367 if (find_minimal)
368 continue;
369
370 #ifdef USE_HEURISTIC
371 /* Heuristic: check occasionally for a diagonal that has made
372 lots of progress compared with the edit distance. If we
373 have any such, find the one that has made the most progress
374 and return it as if it had succeeded.
375
376 With this heuristic, for vectors with a constant small
377 density of changes, the algorithm is linear in the vector
378 size. */
379
380 if (200 < c && big_snake && ctxt->heuristic) {
381 {
382 OFFSET best = 0;
383
384 for (d = fmax; d >= fmin; d -= 2) {
385 OFFSET dd = d - fmid;
386 OFFSET x = fd[d];
387 OFFSET y = x - d;
388 OFFSET v = (x - xoff) * 2 - dd;
389
390 if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
391 if (v > best
392 && xoff + SNAKE_LIMIT <= x && x < xlim
393 && yoff + SNAKE_LIMIT <= y && y < ylim) {
394 /* We have a good enough best diagonal;
395 now insist that it end with a
396 significant snake. */
397 int k;
398
399 for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
400 if (k == SNAKE_LIMIT) {
401 best = v;
402 part->xmid = x;
403 part->ymid = y;
404 break;
405 }
406 }
407 }
408 }
409 if (best > 0) {
410 part->lo_minimal = true;
411 part->hi_minimal = false;
412 return;
413 }
414 }
415
416 {
417 OFFSET best = 0;
418
419 for (d = bmax; d >= bmin; d -= 2) {
420 OFFSET dd = d - bmid;
421 OFFSET x = bd[d];
422 OFFSET y = x - d;
423 OFFSET v = (xlim - x) * 2 + dd;
424
425 if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
426 if (v > best
427 && xoff < x && x <= xlim - SNAKE_LIMIT
428 && yoff < y && y <= ylim - SNAKE_LIMIT) {
429 /* We have a good enough best diagonal;
430 now insist that it end with a
431 significant snake. */
432 int k;
433
434 for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
435 if (k == SNAKE_LIMIT - 1) {
436 best = v;
437 part->xmid = x;
438 part->ymid = y;
439 break;
440 }
441 }
442 }
443 }
444 if (best > 0) {
445 part->lo_minimal = false;
446 part->hi_minimal = true;
447 return;
448 }
449 }
450 }
451 #endif
452
453 /* Heuristic: if we've gone well beyond the call of duty, give
454 up and report halfway between our best results so far. */
455 if (c >= ctxt->too_expensive) {
456 OFFSET fxybest;
457 OFFSET fxbest IF_LINT (= 0);
458 OFFSET bxybest;
459 OFFSET bxbest IF_LINT (= 0);
460
461 /* Find forward diagonal that maximizes |X + Y|. */
462 fxybest = -1;
463 for (d = fmax; d >= fmin; d -= 2) {
464 OFFSET x = MIN (fd[d], xlim);
465 OFFSET y = x - d;
466 if (ylim < y) {
467 x = ylim + d;
468 y = ylim;
469 }
470 if (fxybest < x + y) {
471 fxybest = x + y;
472 fxbest = x;
473 }
474 }
475
476 /* Find backward diagonal that minimizes |X + Y|. */
477 bxybest = OFFSET_MAX;
478 for (d = bmax; d >= bmin; d -= 2) {
479 OFFSET x = MAX (xoff, bd[d]);
480 OFFSET y = x - d;
481 if (y < yoff) {
482 x = yoff + d;
483 y = yoff;
484 }
485 if (x + y < bxybest) {
486 bxybest = x + y;
487 bxbest = x;
488 }
489 }
490
491 /* Use the better of the two diagonals. */
492 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) {
493 part->xmid = fxbest;
494 part->ymid = fxybest - fxbest;
495 part->lo_minimal = true;
496 part->hi_minimal = false;
497 } else {
498 part->xmid = bxbest;
499 part->ymid = bxybest - bxbest;
500 part->lo_minimal = false;
501 part->hi_minimal = true;
502 }
503 return;
504 }
505 }
506 #undef XREF_YREF_EQUAL
507 }
508
509
510 /* Compare in detail contiguous subsequences of the two vectors which
511 are known, as a whole, to match each other.
512
513 The subsequence of vector 0 is |[xoff, xlim)| and likewise for
514 vector 1.
515
516 Note that |xlim, ylim| are exclusive bounds. All indices into the
517 vectors are origin-0.
518
519 If |find_minimal|, find a minimal difference no matter how
520 expensive it is.
521
522 The results are recorded by invoking |note_delete| and
523 |note_insert|.
524
525 Return false if terminated normally, or true if terminated through
526 early abort. */
527
528 static bool
529 compareseq(OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
530 bool find_minimal, struct context *ctxt) {
531 #ifdef ELEMENT
532 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
533 ELEMENT const *yv = ctxt->yvec;
534 #define XREF_YREF_EQUAL(x, y) EQUAL (xv[x], yv[y])
535 #else
536 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
537 #endif
538
539 /* Slide down the bottom initial diagonal. */
540 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff)) {
541 xoff++;
542 yoff++;
543 }
544
545 /* Slide up the top initial diagonal. */
546 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1)) {
547 xlim--;
548 ylim--;
549 }
550
551 /* Handle simple cases. */
552 if (xoff == xlim)
553 while (yoff < ylim) {
554 NOTE_INSERT (ctxt, yoff);
555 if (EARLY_ABORT (ctxt))
556 return true;
557 yoff++;
558 }
559 else if (yoff == ylim)
560 while (xoff < xlim) {
561 NOTE_DELETE (ctxt, xoff);
562 if (EARLY_ABORT (ctxt))
563 return true;
564 xoff++;
565 }
566 else {
567 struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 });
568
569 /* Find a point of correspondence in the middle of the vectors. */
570 diag(xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
571
572 /* Use the partitions to split this problem into subproblems. */
573 if (compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
574 return true;
575 if (compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
576 return true;
577 }
578
579 return false;
580 #undef XREF_YREF_EQUAL
581 }
582
583 #undef ELEMENT
584 #undef EQUAL
585 #undef OFFSET
586 #undef EXTRA_CONTEXT_FIELDS
587 #undef NOTE_DELETE
588 #undef NOTE_INSERT
589 #undef EARLY_ABORT
590 #undef USE_HEURISTIC
591 #undef XVECREF_YVECREF_EQUAL
592 #undef OFFSET_MAX