# ----------------------------------------------------------------------
# int my_memcmp(const void *s1, const void *s2, size_t len)
#
# Target:  i386, GNU as, AT&T syntax.
# In:      arguments on the stack: 8(%ebp) = s1, 12(%ebp) = s2,
#          16(%ebp) = len (offsets relative to EBP after the prologue).
# Out:     EAX =  0 if the first LEN bytes are equal,
#          EAX = -1 if the first differing byte of s1 is below that of s2,
#          EAX =  1 if it is above (bytes compared as unsigned).
# Clobbers: EAX, ECX, EDX, flags.  EBX, EBP, ESI, EDI are saved/restored.
#
# Register roles:
#   EBX = cursor into S1           EDX = cursor into S2
#   EAX = bytes left to compare    ECX = scratch / shift count
#   ESI, EDI = DWORD window on S2  EBP = low two bits of S2 (unaligned
#                                        path only; the frame pointer is
#                                        not valid there, the epilog only
#                                        uses pops)
#
# Strategy: for LEN >= 16, align S1 byte-by-byte, then compare DWORDs
# (directly if S2 is aligned too, otherwise by assembling each S2 DWORD
# from two aligned loads with SHRD), then finish the last 1..4 bytes
# byte-by-byte.  Only aligned DWORDs that contain at least one byte of
# the buffers are ever read.
# ----------------------------------------------------------------------

        .globl  my_memcmp
my_memcmp:
        pushl   %ebp                    # Save some registers
        movl    %esp,%ebp
        pushl   %ebx

        movl    8(%ebp),%ebx            # Load S1
        movl    12(%ebp),%edx           # Load S2
        movl    16(%ebp),%eax           # Load length

        cmpl    $16,%eax                # For small strings
        jb      7f                      # we cannot afford startup overheads

        testb   $3,%bl                  # Check if EBX is already aligned
        jz      2f

        # Align EBX.  At most 3 iterations, and LEN >= 16, so EAX
        # cannot reach zero here.
        .align  16
1:      movb    (%ebx),%cl              # If not, compare byte-by-byte
        cmpb    (%edx),%cl
        jne     9f                      # Until we have a mismatch
        incl    %ebx
        incl    %edx
        decl    %eax
        testb   $3,%bl                  # Or EBX is aligned
        jnz     1b

        # EBX is aligned, check if EDX is
        .align  4
2:      pushl   %edi                    # Save more callee-save regs
        pushl   %esi
        testb   $3,%dl                  # If EDX is aligned too
        jnz     4f                      # use simpler and faster code

        subl    $4,%eax                 # Save a `cmp $4,%eax' below

        # Loop for aligned EBX and EDX.
        # Invariant at 3: EAX = (bytes left) - 4, bytes left >= 5.
        .align  16
3:      movl    (%edx),%esi             # Compare a DWORD
        cmpl    %esi,(%ebx)
        jne     8f
        addl    $4,%ebx                 # Go on with the next one
        addl    $4,%edx                 # if they match
        subl    $4,%eax
        ja      3b                      # Loop while more than 4 bytes remain

        addl    $4,%eax                 # Restore the loop counter (1..4)
        popl    %esi                    # Beginning of epilog
        popl    %edi
        jmp     6f                      # And compare byte-by-byte

        # Set up loop for aligned EBX and unaligned EDX
        .align  4
4:      movl    %edx,%ecx               # Load the low bits of S2 into ECX
        andl    $3,%ecx
        andl    $~3,%edx                # And align EDX to lower dword
        subl    $4,%eax                 # Save a `cmp $4,%eax' below
        movl    %ecx,%ebp               # Save lower bits
        shll    $3,%ecx                 # Byte offset --> bit offset
        movl    (%edx),%edi             # Load the DWORD holding S2's first
                                        # bytes; it becomes ESI at label 5

        # Loop for aligned EBX and unaligned EDX
        #
        # Example: EDX was unaligned by 1, hence CL = 8
        #
        #           ESI                        EDI
        # ,-----.-----.-----.-----.  ,-----.-----.-----.-----.
        # |  2  |  1  |  0  | -1  |  |  6  |  5  |  4  |  3  |
        # `-----'-----'-----'-----'  `-----'-----'-----'-----'
        #                          ,-.
        #                          | |
        #                         _| |_     after
        #                         \   /     shrdl $8, %edi, %esi
        #                          `v'
        #           ESI                        EDI
        # ,-----.-----.-----.-----.  ,-----.-----.-----.-----.
        # |  3  |  2  |  1  |  0  |  |  6  |  5  |  4  |  3  |
        # `-----'-----'-----'-----'  `-----'-----'-----'-----'
        #  compared against (%ebx)   used for the next iteration
        #
        #           ESI                        EDI
        # ,-----.-----.-----.-----.  ,-----.-----.-----.-----.
        # |  6  |  5  |  4  |  3  |  | 10  |  9  |  8  |  7  |
        # `-----'-----'-----'-----'  `-----'-----'-----'-----'
        #
        # etc.
        #
        # Invariant at 5: EDI = DWORD at (%edx), EAX = (bytes left) - 4,
        # bytes left >= 5.  The upper DWORD is loaded here, at the top of
        # the iteration, rather than ahead of time: 4(%edx) then always
        # contains a byte that is about to be compared, so the load never
        # touches a DWORD lying entirely past the end of S2 (which could
        # sit on an unmapped page).
        .align  16
5:      movl    %edi,%esi               # Previous upper DWORD becomes lower
        movl    4(%edx),%edi            # Load the DWORD above it
        shrdl   %cl,%edi,%esi           # Compute an unaligned DWORD
        cmpl    %esi,(%ebx)             # Compare it with the aligned EBX
        jne     8f
        addl    $4,%edx                 # Go on with the next one
        addl    $4,%ebx                 # if they matched
        subl    $4,%eax
        ja      5b                      # Loop while more than 4 bytes remain

        orl     %ebp,%edx               # When few bytes remain, work again
                                        # on the unaligned EDX
        addl    $4,%eax                 # Restore the loop counter (1..4)
        popl    %esi                    # Beginning of epilog
        popl    %edi

        # Byte-by-byte loop.  Entered with EAX >= 1.
        .align  16
6:      movb    (%ebx),%cl              # Compare byte-by-byte
        cmpb    (%edx),%cl
        jne     9f
        incl    %ebx                    # Go on if they matched
        incl    %edx
        decl    %eax
        jne     6b

        popl    %ebx                    # Epilog
        popl    %ebp
        retl                            # Return EAX = 0

        # Handle LEN < 16
        .align  16
7:      testl   %eax,%eax               # If no characters to be compared
        jnz     6b                      # Exit immediately
        popl    %ebx                    # Epilog
        popl    %ebp
        retl                            # Return EAX = 0

        # Compute return value for mismatching (%EBX) and %ESI
        .align  16
8:      movl    (%ebx),%edi             # Load the two mismatching DWORDs
        bswapl  %esi                    # Make them big-endian so the first
        bswapl  %edi                    # differing byte is the most significant
        cmpl    %esi,%edi               # CF = 1 iff S1 < S2 (unsigned)
        popl    %esi                    # Beginning of epilog (pops keep flags)
        popl    %edi

        # Compute return value from flags (CF set means S1 < S2)
        .align  4
9:      sbbl    %eax,%eax               # -1 if <, 0 if >
        orl     $1,%eax                 # -1 if <, 1 if >
        popl    %ebx                    # Epilog
        popl    %ebp
        retl