/* **++ ** FACILITY: DIFF.C ** ** MODULE DESCRIPTION: ** ** This module compares two arrays of lines (representing ** files) and reports the sequences of consecutive matching ** lines between them using the "recursive longest matching ** sequence" algorithm. ** ** AUTHORS: ** Steppe, Tom. "File Comparison Algorithms." Dr. Dobb's ** Journal, #131 (Sep 1987): 54-60. ** ** CREATION DATE: 2-21-1991 ** ** ** MODIFICATION HISTORY: ** ** {@tbs@}... **-- */ /* ** ** INCLUDE FILES ** */ #include #include #include #include #include #include "diff.h" /* ** ** MACRO DEFINITIONS ** */ #define TRUE 1 #define FALSE 0 /** Maximum and Minimum macro **/ #define max(x, y) (((x) > (y)) ? (x) : (y)) #define min(x, y) (((x) < (y)) ? (x) : (y)) /** Value to indicate identical strings with strcmp **/ #define ALIKE 0 /** Mask for number of bits in hash code (12 bits) **/ #define MASK (unsigned int) 0x0FFFF /** Number of possible hash codes **/ #define HASHSIZ (MASK+1) /* ** ** GLOBAL DECLARATIONS ** **/ int diff_sec, /** Number of diff sections **/ diff_line; /** Number of diff lines **/ int diff_beg1, /** First line# for diff sec in file 1 **/ diff_beg2; /** First line# for diff sec in file 2 **/ /* ** ** INTERNAL FUNCTION PROTOTYPE DECLARATIONS ** **/ static BOOLEAN get_inf (char *, TXTINF *, int, FILEINF *); static HASH calc_hash (TXTINF); static void fnd_seq (FILEINF *, int, int, FILEINF *, int, int, int); static BOOLEAN chk_hashes (LINEINF *, LINEINF *, int); static int cnt_matches (TXTINF *, TXTINF *, int); static void rpt_seq (FILEINF *, int, FILEINF *, int, int); static void rpt_seq_1 (FILEINF *, int, int, FILEINF *, int, int); /* **++ ** FUNCTIONAL DESCRIPTION: ** ** compare() compares two arrays of lines and reports the ** sequences of consecutive matching lines. The zeroth ** element of each array is unused so that the index into ** the array is identical to the associated line number. ** ** RETURN VALUE: ** ** TRUE if comparison succeeded. ** FALSE if not enough memory. ** **-- */ BOOLEAN compare (r1, a1, n1, r2, a2, n2, lngval) char *r1; /** File name for #1 **/ TXTINF *a1; /** Array of lines of text in #1 **/ int n1; /** Number of lines in a1 (does not count 0th element) **/ char *r2; /** File name for #2 **/ TXTINF *a2; /** Array of lines of text in #2 **/ int n2; /** Number of lines in a2 (does not count 0th element) **/ int lngval; /** "Long enough" value **/ { FILEINF f1, /** File info for #1 **/ f2; /** File info for #2 **/ BOOLEAN rtn; /** Return value **/ /** Initialize **/ diff_sec = diff_line = 0; diff_beg1 = diff_beg2 = 0; /** Gather info for each file, then compare **/ if (rtn = (get_inf (r1, a1, n1, &f1) && get_inf (r2, a2, n2, &f2))) { fnd_seq (&f1, 1, n1, &f2, 1, n2, lngval); } /** Output remaining difference lines **/ if (!(diff_beg1 == n1 && diff_beg2 == n2)) { rpt_seq_1 (&f1, n1, n1-diff_beg1, &f2, n2, n2-diff_beg2); diff_line++; } /** Output summary report **/ fprintf (stdout, "\nNumber of difference sections found: %d\n", diff_sec); fprintf (stdout, "Number of difference records found: %d\n", diff_line); /** Return dynamic memory to pool **/ free (f1.line); free (f1.hashtbl); free (f2.line); free (f2.hashtbl); return rtn; } /* **++ ** FUNCTIONAL DESCRIPTION: ** ** get_inf() calculates hash codes and builds a hash table ** ** ** RETURN VALUE: ** ** TRUE if get_inf() succeeded. ** FALSE if not enough memory. ** **-- */ static BOOLEAN get_inf (r, a, n, f) char *r; /** File name **/ TXTINF *a; /** Array of lines of text **/ int n; /** Number of lines in a **/ FILEINF *f; /** File info **/ { unsigned int size; /** Size of hash table **/ register int i; /** Counter **/ TBLENTRY *entry; /** Entry in hash table **/ f->filename = r; /** Assign the file name **/ f->txt = a; /** Assign the array of text **/ /** Allocate and initialize a hash table **/ size = HASHSIZ * sizeof (TBLENTRY); if (f->hashtbl = (TBLENTRY *) malloc (size)) { memset ((char *) f->hashtbl, '\0', size); } else { return (FALSE); } if (n <= 0) { /** There is no lines **/ f->line = NULL; } else { /** Allocate an array of line structs **/ if ((f->line = (LINEINF *) malloc ((n+1) * sizeof (LINEINF))) == NULL) { return (FALSE); } for (i = 1; i <= n; i++) { /** Loop through the lines **/ f->line[i].hash = calc_hash (f->txt[i]); /** Calculate hash **/ entry = f->hashtbl + f->line[i].hash; /** Locate entry **/ /** Update the linked list of lines iwth the same hash code **/ f->line[entry->last].nxtln = i; f->line[i].nxtln = 0; /** Update the first and last line info in the hash table **/ if (entry->frst == 0) { entry->frst = i; } entry->last = i; } } return (TRUE); } /* **++ ** FUNCTIONAL DESCRIPTION: ** ** calc_hash() calculates a hash code for a line of text. ** ** RETURN VALUE: ** ** a hash code value. ** **-- */ static HASH calc_hash (buf) TXTINF buf; /** Line of text **/ { register unsigned int chksum; /** Checksum **/ register int i; /** Counter **/ char *s; /** Pointer **/ HASH hash; /** Hash code value **/ /** Build up a checksum of the characters in the text **/ for (chksum = 0, s = buf.content, i = 0; i < buf.len; chksum ^= *s++, i++) { ; } /** Combine the 7-bit checksum and as much of the length as is possible **/ hash = ((chksum & 0x7F) | ((s - buf.content) << 7)) & MASK; return (hash); } /* **++ ** FUNCTIONAL DESCRIPTION: ** ** fnd_seq() finds a "good sequence" of lines within the given ** starting and ending line numbers. fnd_seq() then recursively ** finds "good sequences" in the sections of lines above the ** "good sequence" and below it. ** ** RETURN VALUE: ** ** none ** **-- */ static void fnd_seq (f1, beg1, end1, f2, beg2, end2, lngval) FILEINF *f1; /** File info for #1 **/ int beg1; /** First line # to compare in #1 **/ int end1; /** Last line # to compare in #1 **/ FILEINF *f2; /** File info for #2 **/ int beg2; /** First line # to compare in #2 **/ int end2; /** Last line # to compare in #2 **/ int lngval; /** "Long enough" value **/ { LINEINF *line1, /** Line info ptr in #1 **/ *line2; /** Line info ptr in #2 **/ register int limit; /** Looping limit **/ int ln1, /** Line number in #1 **/ ln2; /** Line number in #2 **/ register int ln; /** Working line number **/ BOOLEAN go; /** Continue to loop? **/ int most, /** Longest possible seq **/ most1, /** Longest possible due to #1 **/ most2; /** Longest possible due to #2 **/ int cnt, /** Length of longest seq **/ oldcnt, /** Length of prev longest seq **/ n, /** Length of cur longest seq **/ m1, /** Line of longest seq in #1 **/ m2; /** Line of longest seq in #2 **/ /** Initialize **/ go = TRUE; line1 = f1->line; line2 = f2->line; /** Initialize longest sequence info **/ cnt = 0; /** Length of longest seq **/ m1 = beg1 - 1; /** Line # of longest seq in #1 **/ m2 = beg2 - 1; /** Line # of longest seq in #2 **/ oldcnt = 0; /** Length of prev longest seq **/ /* ** Calculate max possible number of consecutive lines that ** can match (based on line # range) **/ most1 = end1 - beg1 + 1; most2 = end2 - beg2 + 1; /* ** Scan lines looking for a "good sequence". ** Compare lines in the following order of line numbers: ** ** (1, 1) ** (1, 2), (2, 1), (2, 2) ** (1, 3), (2, 3), (3, 1), (3, 2), (3, 3) ** ... **/ for (ln1 = beg1, ln2 = beg2; TRUE; ln1++, ln2++) { if (ln2 <= end2 - cnt) /* ** There are enough lines left in #2 such that it ** is possible to find a longer sequence. **/ { /* ** Determine the limit in #1 that both ** enforces the order scheme and still makes ** it possible to find a longer sequence. **/ limit = min (ln1 - 1, end1 - cnt); /** Calculate first potential match in #1 **/ for (ln = f1->hashtbl[line2[ln2].hash].frst; ln && ln < beg1; ln = line1[ln].nxtln) { ; } /** Loop through the lines in #1 **/ for (; ln && ln <= limit; ln = line1[ln].nxtln) { if (line1[ln].hash == line2[ln2].hash && line1[ln+cnt].hash == line2[ln2+cnt].hash && !(ln-m1 == ln2-m2 && ln < m1+cnt && m1 != beg1-1)) { /* ** A candidate for a longer sequence has been found ** The current lines match, the currents + cnt match, ** and this sequence is not a subset of the longest ** sequence so far. **/ /** Calculate most possible matches **/ most = min (end1-ln+1, most2); /* ** First compare hash codes. If the number of matches ** exceeds the longest sequence so far, then compare ** the actual text. **/ if (chk_hashes(line1+ln, line2+ln2, cnt) && (n = cnt_matches (f1->txt+ln, f2->txt+ln2, most)) > cnt) { /** This is the longest seq. so far **/ /** Update longest seq info **/ oldcnt = cnt; cnt = n; m1 = ln; m2 = ln2; /** If it's long enough, end the search **/ if (cnt >= lngval) { break; } /** Update limit, using new count **/ limit = min (ln1 - 1, end1 - cnt); } } } /** If it's long enough, end the search **/ if (cnt >= lngval) { break; } most2--; } else { go = FALSE; /** EOF of this is reached **/ } /** Repeat the process for the other file **/ if (ln1 <= end1 - cnt) { limit = min (ln2, end2 - cnt); for (ln = f2->hashtbl[line1[ln1].hash].frst; ln && ln < beg2; ln = line2[ln].nxtln) { ; } for (; ln && ln <= limit; ln = line2[ln].nxtln) { if (line1[ln1].hash == line2[ln].hash && line1[ln1+cnt].hash == line2[ln+cnt].hash && !(ln1-m1 == ln-m2 && ln1 < m1+cnt && m2 != beg2-1)) { most = min (end2 - ln + 1, most1); if (chk_hashes (line1 + ln1, line2 + ln, cnt) && (n = cnt_matches (f1->txt + ln1, f2->txt + ln, most)) > cnt) { oldcnt = cnt; cnt = n; m1 = ln1; m2 = ln; if (cnt >= lngval) { break; } limit = min (ln2, end2 - cnt); } } } if (cnt >= lngval) { break; } most1--; } else if (!go) { break; /** EOF of this file is reached **/ } } /* ** If the longest seq is shorter than the "long engoug" value, ** the "long enough" value can be adjusted for the rest of ** the comparison process. **/ if (cnt < lngval) { lngval = cnt; } if (cnt >= 1) /** Longest seq exceeds min necessary size **/ { if (m1 != beg1 && m2 != beg2 && oldcnt > 0) /* ** There is still something worgh comparing previous to ** the seq. **/ { /** Use knowledge of the previous longest seq **/ fnd_seq (f1, beg1, m1 - 1, f2, beg2, m2 - 1, oldcnt); } /** Report the seq **/ rpt_seq (f1, m1, f2, m2, cnt); if (m1 + cnt - 1 != end1 && m2 + cnt - 1 != end2) /** There is still someghing worth comparing subseq to the seq **/ { fnd_seq (f1, m1 + cnt, end1, f2, m2 + cnt, end2, lngval); } } } /* **++ ** FUNCTIONAL DESCRIPTION: ** ** chk_hashes() determines whether this seq of matching ** hash codes is longer than cnt. It knows that the first ** pair of hash codes is guaranteed to match. ** ** RETURN VALUE: ** ** TRUE if this seq is longer than cnt. ** FALSE if this seq is not longer than cnt. ** **-- */ static BOOLEAN chk_hashes (line1, line2, cnt) LINEINF *line1, /** Line info for #1 **/ *line2; /** Line info for #2 **/ register int cnt; /** Counter to try to exceed **/ { register int n; /** Counter of consecutive matches **/ for (n = 1; n <= cnt && (++line1)->hash == (++line2)->hash; n++) { ; } return (n > cnt); } /* **++ ** FUNCTIONAL DESCRIPTION: ** ** cnt_matches() counts the numbner of consecutive matching ** lines of text. ** ** RETURN VALUE: ** ** Number of consecutive matching lines. ** **-- */ static int cnt_matches (s1, s2, most) TXTINF *s1, /** Starting line in file #1 **/ *s2; /** Starting line in file #2 **/ register int most; /** Most matching lines possible **/ { register int n; /** Count of consecutive matches **/ /** Count the consecutive matches **/ for (n = 0; n < most && txtcmp (*s1++, *s2++) == ALIKE; n++) { ; } return (n); } int txtcmp (s, t) TXTINF s, t; { register int i; int c; if (s.len > t.len) { return 1; } else if (s.len < t.len) { return -1; } else { for (i = 0; i < s.len; i++) { if ((c = s.content[i]-t.content[i])) { return (c > 0? 1 : -1); } } } return 0; } /* **++ ** FUNCTIONAL DESCRIPTION: ** ** rpt_seq() reports a matching seq of lines ** ** RETURN VALUE: ** ** None ** **-- */ static void rpt_seq (f1, m1, f2, m2, cnt) FILEINF *f1; /** File info for #1 **/ int m1; /** Location of matching seq in #1 **/ FILEINF *f2; /** File info for #2 **/ int m2; /** Location of matching seq in #2 **/ int cnt; /** Number of lines in matching seq **/ { if (!(m1 == 1 && m2 == 1)) { rpt_seq_1 (f1, m1, cnt, f2, m2, cnt); } diff_beg1 = m1+cnt-1; diff_beg2 = m2+cnt-1; } /* **++ ** FUNCTIONAL DESCRIPTION: ** ** rpt_seq_1() ouputs diff report for a section ** ** RETURN VALUE: ** ** None ** **-- */ static void rpt_seq_1 (f1, m1, cnt1, f2, m2, cnt2) FILEINF *f1; /** File info for #1 **/ int m1; /** Location of matching seq in #1 **/ int cnt1; /** Number of lines in matching seq in #1 **/ FILEINF *f2; /** File info for #2 **/ int m2; /** Location of matching seq in #2 **/ int cnt2; /** Number of lines in matching seq in #2 **/ { struct FAB fab; struct RAB rab; unsigned long retcode; static char linebuf[MAXCHARS+1]; $DESCRIPTOR (linebuf_descriptor, linebuf); char *filename = "SYS$OUTPUT"; int i; int len; /* ** Initialize FAB **/ fab = cc$rms_fab; fab.fab$l_fna = filename; fab.fab$b_fns = strlen(filename); /* ** Initialize Source File RAB **/ rab = cc$rms_rab; rab.rab$l_fab = &fab; /* ** Report difference **/ if (sys$open (&fab, 0, 0) != RMS$_NORMAL) { fprintf(stderr, "Insufficient privilege for write operation"); return; } rab.rab$b_rac = RAB$C_SEQ; rab.rab$l_rbf = linebuf; if ((retcode = sys$connect (&rab, 0, 0)) != RMS$_NORMAL) { sys$close(&fab, 0, 0); fprintf(stderr, "Insufficient privilege for connect operation"); return; } strcpy (linebuf, "************\r\n"); rab.rab$w_rsz = strlen(linebuf); if ((retcode = sys$put (&rab, 0, 0)) != RMS$_NORMAL) { sys$close(&fab, 0, 0); fprintf(stderr, "Insufficient privilege for write operation"); return; } sprintf (linebuf, "File %s\r\n", f1->filename); rab.rab$w_rsz = strlen(linebuf); if ((retcode = sys$put (&rab, 0, 0)) != RMS$_NORMAL) { sys$close(&fab, 0, 0); fprintf(stderr, "Insufficient privilege for write operation"); return; } for (i = diff_beg1+1; i <= m1; i++) { sprintf (linebuf, "%6.6d ", i); len = strlen (linebuf); memcpy (linebuf+len, f1->txt[i].content, f1->txt[i].len); memcpy (linebuf+len+f1->txt[i].len, "\r\n", 2); rab.rab$w_rsz = f1->txt[i].len+len+2; if ((retcode = sys$put (&rab, 0, 0)) != RMS$_NORMAL) { sys$close(&fab, 0, 0); fprintf(stderr, "Insufficient privilege for write operation"); return; } } strcpy (linebuf, "******\r\n"); rab.rab$w_rsz = strlen(linebuf); if ((retcode = sys$put (&rab, 0, 0)) != RMS$_NORMAL) { sys$close(&fab, 0, 0); fprintf(stderr, "Insufficient privilege for write operation"); return; } sprintf (linebuf, "File %s\r\n", f2->filename); rab.rab$w_rsz = strlen(linebuf); if ((retcode = sys$put (&rab, 0, 0)) != RMS$_NORMAL) { sys$close(&fab, 0, 0); fprintf(stderr, "Insufficient privilege for write operation"); return; } for (i = diff_beg2+1; i <= m2; i++) { sprintf (linebuf, "%6.6d ", i); len = strlen (linebuf); memcpy (linebuf+len, f2->txt[i].content, f2->txt[i].len); memcpy (linebuf+len+f2->txt[i].len, "\r\n", 2); rab.rab$w_rsz = f2->txt[i].len+len+2; if ((retcode = sys$put (&rab, 0, 0)) != RMS$_NORMAL) { sys$close(&fab, 0, 0); fprintf(stderr, "Insufficient privilege for write operation"); return; } } strcpy (linebuf, "************\r\n"); rab.rab$w_rsz = strlen(linebuf); if ((retcode = sys$put (&rab, 0, 0)) != RMS$_NORMAL) { fprintf(stderr, "Insufficient privilege for write operation"); } diff_sec++; diff_line += max (m1-diff_beg1-1, m2-diff_beg2-1); sys$close (&fab, 0, 0); }