Category : Assembly Language Source Code
Archive   : SIMILF.ZIP
Filename : SIMILF.ASM

 
Output of file : SIMILF.ASM contained in archive : SIMILF.ZIP

;
; S I M I L F . A S M
; =================
; Entered & adapted by J Meeks (& Herb Martin a little); latest
; version 05-09-89, 00:13am.
;
; Adapted from Dr. Dobb's Journal, 07-88 issue, pp. 46-51 (text), pp.
; 68-72 (program listing).
;
; Written by John W. Ratcliff & David E. Metzener, 11-10-87.
;
; Uses the Ratcliff/Obershelp pattern recognition algorithm.
;
; This program provides a new function for Turbo Pascal 5.0 on an
; 8086-based machine. The function similf returns a percentage value
; corresponding to the degree to which two strings are similar to one
; another. Be certain to convert both strings to uppercase if you
; want to have a comparison that is case-insensitive.
;
; NOTE: This routine is for use with Turbo Pascal 5.0 programs. The
; original simil is for use with C programs, and is almost identical
; to this one.
;
TITLE SIMILF.ASM
;
ASSUME CS: Code, DS: Data, ES: Data
;
DATA SEGMENT
;
ststr1l dw 25 dup (?) ; Lefts for string 1.
ststr1r dw 25 dup (?) ; Rights for string 1.
ststr2l dw 25 dup (?) ; Lefts for string 2.
ststr2r dw 25 dup (?) ; Rights for string 2.
stcknum dw ? ; # elements on stack.
score dw ? ; # chars in common * 2.
total dw ? ; Total # chars in strings 1 & 2.
cl1 dw ? ; Left of string 1 found in common.
cr1 dw ? ; Right of string 1 found in common.
cl2 dw ? ; Left of string 2 found in common.
cr2 dw ? ; Right of string 2 found in common.
s2ed dw ? ; End of string 2 used in compare.
;
DATA ENDS
PUBLIC similf
;
Code SEGMENT
;
similf PROC NEAR
;
; This routine expects pointers passed to two character strings, null
; terminated, that you wish compared. It returns a percentage value
; from 0% to 100% corresponding to how alike the two strings are.
;
; Usage: Simil (Str1, Str2: string);
;
; The similarity routine is composed of three major components:
; compare Finds the largest group of characters in common between
; any two string sections.
; pushst Pushes a string section to be compared onto the stack.
; popst Pops a string section to be examined off the stack.
;
; The similarity routine begins by computing the total length of both
; strings passed and placing that value in total. It then takes the
; beginnings and endings of both strings passed and pushes them onto
; the stack. It then falls through into the main line code.

; The original two strings are immediately popped off the stack and
; are passed to the compare routine. The compare routine finds the
; largest group of characters in common between the two strings.

; The number of characters in common is multiplied by two and added
; to the total score. If there were no characters in common, then
; there is nothing to push onto the stack. If there is exactly one
; character to the left in both strings then we needn't push them
; onto the stack (we already know they aren't equal from the previous
; call to compare). Otherwise the characters on the left are pushed
; onto the stack. These same rules apply to characters to the right
; of the substring found in in common. This process of popping
; substrings off the stack, comparing them, and pushing remaining
; sections onto the stack is continued until the stack is empty.

; On return, the total score is divided by the number of characters
; in both strings. This is multiplied by 100 to yield a percentage.
; This percentage similarity is returned to the calling procedure.

push bp ; Save BP.
mov bp, sp ; Save SP in BP for use in program.
push es ; Save ES.
mov ax, ds ; Copy DS into ES.
mov es, ax
xor ax, ax ; Clear AX.
mov score, ax ; Clear score variable.
mov stcknum, ax ; Clear number of stack entries.
mov si, [bp+4] ; Move string 1 begin ptr to SI.
mov di, [bp+8] ; Move string 2 begin ptr to DI.
cmp [si], al ; Is string 1 a null string?
je strerr ; Can't process null strings!
cmp [di], al ; Is string 2 a null string?
jne docmp ; Neither string null; process 'em.
strerr: jmp donit ; Jump to exit routine.
docmp: push di ; Save DI (because of SCAS).
push si ; Save SI (because of SCAS).
xor al, al ; Clr AL to search for string end.
cld ; Clr direction flag to forward.
mov cx, -1 ; Repeat correct # times.
repnz scasb ; Scan for delimiter in string 2.
dec di
dec di ; Point DI to last char of str 2.
mov bp, di ; Move DI to BP, where should be.
pop di ; Original SI into DI for scasb.
repnz scasb ; Scan for delimiter in string 1.
not cx ; 1's comp of string 1's lentgh.
sub cx, 2 ; - 2 zero bytes at end string 1.
mov total, cx ; Sum of strings' lengths in total.
dec di
dec di ; Point DI to last char of str 1.
mov bx, di ; Move DI to BX, where should be.
pop di ; Restore original DI.
call pushst ; Push for 1st call to compare.
main: cmp stcknum, 0 ; Is there anything on the stack?
je done ; No. Nothing to compare!
call popst ; Set up regs for a compare call.
call compare ; Do compare for substring set.
cmp dx, 0 ; Nothing in common to push.
je main ; Try another set substrings.
shl dx, 1 ; Multiply # common chars by 2.
add score, dx ; Add 2 * common chars to total.
mov bp, stcknum ; Get # of entry to be looked at.
shl bp, 1 ; Ready AX to access string stk.
mov si, [ststr1l+bp] ; Move l1 into SI (or l1).
mov bx, cl1 ; Move cl1 into BX (or r1).
mov di, [ststr2l+bp] ; Move l2 into DI (or l2).
mov cx, cl2 ; Move cl2 into CX temporarily.
mov ax, [ststr1r+bp] ; Get old r1 off stack.
mov cl1, ax ; Place in cl1 temporarily.
mov ax, [ststr2r+bp] ; Get old r2 off stack.
mov cl2, ax ; Place in cl2 temporarily.
mov bp, cx ; Move cl2 into BP.
cmp bx, si ; Compare cl1 to l1.
je chrght ; Nothing on left side of string 1.
cmp bp, di ; Compare cl2 to l2.
je chrght ; Nothing on left side of string 2.
dec bx ; Point last part left side str 1.
dec bp ; Point last part left side str 2.
cmp bx, si ; Only one char to compare?
jne pushit ; No. We need to examine this.
cmp bp, di ; Only one character in both?
je chrght ; Only 1 char; nothing to look at.
pushit: call pushst ; Push left side onto stack.
chrght: mov si, cr1 ; Move cr1 into SI (or l1).
mov bx, cl1 ; Move r1 into BX (or r1).
mov di, cr2 ; Move cr2 into DI (or l2).
mov bp, cl2 ; Move r2 into BP (or r2).
cmp si, bx ; Compare cr1 to r1.
je main ; Nothing on right side of str 1.
cmp di, bp ; Compare cr2 to r2.
je main ; Nothing on right side of str 2.
inc si ; Point last part right side str 1.
inc di ; Point last part right side str 2.
cmp bx, si ; Only one character to examine?
jne push2 ; No. Examine it.
cmp bp, di ; Only one character in both?
je main ; Yes. Get next strs off stack.
push2: call pushst ; Push right side onto stack.
jmp short main ; Do next level of compares.
done: mov ax, score ; Get score into AX for multiply.
mov cx, 100 ; Put 100 into CX for multiply.
mul cx ; Multiply score by 100.
mov cx, total ; Get total # chars for divide.
div cx ; Div (score * 100) by # chars.
donit: pop es ; Restore ES to entry value.
POP BP ; Restore BP to entry value.
ret 8 ; Return; AX = % similarity.
similf ENDP
;
;
compare PROC NEAR
;
; The compare procedure locates the largest matching group of
; characters between string 1 and string 2. The routine assumes
; the CPU's direction flag is cleared.
;
; Pass to this routine:
; DS:SI l1 (left side of string 1),
; BX r1 (right side of string 1),
; ES:DI l2 (left side of string 2),
; BP r2 (right side of string 2).
; The routine returns:
; DX # characters matching,
; cl1 Left side of first matching substring,
; cl2 Left side of second matching substring,
; cr1 Right side of first matching substring,
; cr2 Right side of second matching substring.
;
; The compare routine is composed of two loops, an inner and an
; outer. The worst case scenario is that there are absolutely no
; characters in common between string 1 and string 2. In this case
; N * M compares are performed. However, when a substring match
; occurs in the inner loop, then the next character to be examined in
; string 2 (for this loop) is advanced by the number of characters
; found equal. Whenever a new maximum number of characters in common
; is found then the ending locations of both the inner and outer
; loops are backed off by the difference between the new max chars
; value and the old max chars value; i.e., if 5 characters have been
; found in common part of the way through the search then we can cut
; our search short by 5 characters since there is no chance of
; finding better than a 5-character match at that point. This
; technique means that an exact match will require only a single
; compare, and combinations of other matches will proceed as
; efficiently as possible.
;
; Note that we branch downward for both newmax and newmax2, because
; on the 8086 line of processors a branch not taken is much faster
; than one that is taken. Since the not-equal condition is to be
; found most often, and we would like the inner loop to execute as
; quickly as possible, we branch outside the loop's body on the less
; frequent occurence. When a match and/or a new maxchars is found
; then we branch down to these two routines, process the new
; conditions, then jump back up into the main line loop code.
mov s2ed, bp ; Store end of string 2.
xor dx, dx ; Initialize maxchars.
forl3: push di ; Save start of string 2.
forl4: push di ; Save start of string 2.
push si ; Save start of string 1.
mov cx, s2ed ; Set up to calc length of str 1.
sub cx, di ; Get (length of string 1) - 1.
inc cx ; Get length of string 1.
push cx ; Save starting length of str 1.
repz cmpsb ; Compare strings.
jz equal ; If equal, skip fixes.
inc cx ; Back up (CMPSB decs even if <>).
equal: pop ax ; Get starting length of string 1.
sub ax, cx ; Get length of common substring.
jnz newmax ; More than zero chars matched.
pop si ; Get back start of string 1.
pop di ; Get back start of string 2.
reent: inc di ; Do next char no matter what.
reent2: cmp di, bp ; Are we done with string 2?
jle forl4 ; No. Do next string compare.
pop di ; Get back start of string 2.
inc si ; Next char in string 1 to scan.
cmp si, bx ; Are we done with string 1?
jle forl3 ; No. Do next string compare.
ret ; Return with maxchars in DX.
;
newmax: cmp ax, dx ; Greater than maxchars?
jg newmx2 ; Yes. Update new maxchars & ptrs.
pop si ; Get back start of string 1.
pop di ; Get back start of string 2.
add di, ax ; Skip past matching chars.
jmp short reent2 ; Re-enter inner loop.
newmx2: pop si ; Get back start of string 1.
pop di ; Get back start of string 2.
mov cl1, si ; Put beginning match of str 1.
mov cl2, di ; Put beginning match of str 2.
mov cx, ax ; Save new maxchars.
sub ax, dx ; Get delta for str end adjustment.
sub bx, ax ; Adjust end of string 1.
sub bp, ax ; Adjust end of string 2.
mov dx, cx ; New maxchars.
dec cx ; Set up advance last match char.
add di, cx ; Advance to last match char str 2.
mov cr2, di ; Put ending of match of string 2.
add cx, si ; Advance to last match char str 1.
mov cr1, cx ; Put ending of match of string 1.
jmp short reent ; Re-enter inner loop.
;
compare ENDP
pushst PROC NEAR
;
; Pass to this routine:
; DS:SI l1 (left side of string 1),
; BX r1 (right side of string 1),
; ES:DI l2 (left side of string 2),
; BP r2 (right side of string 2).
;
mov cx, bp ; Save r2.
mov bp, stcknum ; Get # entries on stack.
shl bp, 1 ; Multiply by 2 for words.
mov [bp+ststr1l], si ; Put left side string 1 on stack.
mov [bp+ststr1r], bx ; Put right side string 1 on stack.
mov [bp+ststr2l], di ; Put left side string 2 on stack.
mov [bp+ststr2r], cx ; Put right side string 2 on stack.
inc stcknum ; Add 1 to number stack entries.
mov bp, cx ; Restore r2.
ret ; Return.
;
pushst ENDP
;
;
popst PROC NEAR
;
; Pass to this routine:
; DS:SI l1 (left side of string 1),
; BX r1 (right side of string 1),
; ES:DI l2 (left side of string 2),
; BP r2 (right side of string 2).
;
dec stcknum ; Point to last entry in stack.
mov bp, stcknum ; Get # entries on stack.
shl bp, 1 ; * 2 for words.
mov si, [bp+ststr1l] ; Restore left string 1 from stk.
mov bx, [bp+ststr1r] ; Restore right string 1 from stk.
mov di, [bp+ststr2l] ; Restore left string 2 from stk.
mov bp, [bp+ststr2r] ; Restore right string 2 from stk.
ret ; Return.
;
popst ENDP
;
;
Code ENDS
;
END ; of file SimilF.Asm.