[ create a new paste ] login | about

Link: http://codepad.org/smKMLe4Z    [ raw code | fork ]

Plain Text, pasted on Jul 31:
.section .text.memmove,"ax",%progbits
.global memmove
.global memcpy
.type memmove, %function
.type memcpy, %function
@ ------------------------------------------------------------------------------------------------
@ memmove / memcpy: copy len bytes from src to dest.
@ Both symbols label the same address, so memcpy gets the overlap-safe memmove behaviour.
@
@ In:       R0 = dest, R1 = src, R2 = len
@ Out:      R0 = dest (the value passed in)
@ Modifies: R1, R2, R3, flags (the default/performance variant also uses R12)
@
@ Direction is chosen by an unsigned compare of the two pointers:
@   src <  dest  -> copy backward, starting at the end of both buffers (label 9)
@   src == dest  -> nothing to do, return immediately
@   src >  dest  -> copy forward (falls through into the selected forward variant)
@
@ The implementation variant is selected with C preprocessor conditionals (SQUEEZE,
@ OPTIMIZE_SIZE), so this file has to be preprocessed before it is assembled.
@ NOTE(review): the conditional mnemonics (ldmfdlt, ldrbge, ...) are in unified syntax, but no
@ ".syntax unified" directive is visible in this section - confirm it is set elsewhere.
@ ------------------------------------------------------------------------------------------------
memmove:                            @ Copy memory forward or backward, allows overlapping src/dest
memcpy:                             @ args: R0: dest, R1: src, R2: len; returns: dest
    @ Check if we need to go backwards
    cmp r1, r0                      @ TEST src - dst
    @ NOTE: this branch is taken before the stmfd below, so the code at label 9 is entered
    @ with nothing pushed by this routine yet.
    bcc 9f                          @ IF src < dest (unsigned): THEN GOTO [backward]
    @ If src == dst we don't need to do anything at all
    it eq                           @ IF src == dest:
    bxeq lr                         @     THEN return dest
    @ Forward path only: keep the original dest for the return value, and the return address
    @ so that the copy code may pop straight into PC (and, in the default variant, use LR as scratch)
    stmfd sp!, {r0, lr}             @ SAVE return_addr orig_dest                         @ STACK: orig_dest return_addr
                                    @ (fallthrough)

#if defined(SQUEEZE)
@ Absolute minimum size forward implementation:
@ Reached by fallthrough from the entry code with R0 = dest, R1 = src, R2 = len and
@ {orig_dest, return_addr} on the stack.
1:                                  @ DO:
    @ Stupidly copy a byte at a time
    @ The length is decremented and tested BEFORE each byte, so len == 0 copies nothing.
    @ The test is signed (LT): a length with bit 31 set also returns without copying.
    subs r2, r2, #1                 @     len--
    it lt                           @     IF len < 0:
    ldmfdlt sp!, {r0, pc}           @         THEN RETURN orig_dest
    ldrb r3, [r1], #1               @     R3 = *src++ (byte)
    strb r3, [r0], #1               @     *dest++ = R3 (byte)
    b 1b                            @ WHILE true
@ End of absolute minimum size forward implementation

#elif defined(OPTIMIZE_SIZE)
@ Size optimized forward implementation:
@ Reached by fallthrough from the entry code with R0 = dest, R1 = src, R2 = len and
@ {orig_dest, return_addr} on the stack.
@ Word copies are used only if BOTH pointers are 4 byte aligned; otherwise everything is
@ copied by the byte loop at label 2.
@ While in word mode R2 holds (remaining - 4), so "R2 >= 0" means "at least one more word".
    @ Skip to tail if we have to copy less than 4 bytes
    subs r2, r2, #4                 @ len -= 4                                           @ len offset: -4
    blt 2f                          @ IF len < 0: THEN GOTO [remainder]
    @ Skip to tail if src or dest are misaligned
    tst r0, #3                      @ TEST dest & 3
    it eq                           @ IF !(dest & 3):
    tsteq r1, #3                    @     THEN TEST src & 3
    bne 2f                          @ IF (dest & 3) OR (src & 3): THEN GOTO [remainder]
    @ Copy 4 bytes at a time until less than 4 are remaining
1:                                  @ DO:
    ldr r3, [r1], #4                @     R3 = *src++ (word)
    str r3, [r0], #4                @     *dest++ = R3 (word)
    subs r2, r2, #4                 @     len -= 4
    bge 1b                          @ WHILE: len >= 0
                                    @ (fallthrough)
                                    @
2:                                  @ [remainder]: copy the remainder byte by byte
    @ Correct length offset from 4 byte copying mode
    adds r2, r2, #4                 @ len += 4                                           @ len offset: 0
1:                                  @ DO:
    @ Copy the remainder a byte at a time
    @ The Z flag tested here comes from the adds above on the first pass and from the
    @ subs at the bottom of the loop on every later pass.
    it eq                           @     IF len == 0:
    ldmfdeq sp!, {r0, pc}           @         THEN RETURN orig_dest
    ldrb r3, [r1], #1               @     R3 = *src++ (byte)
    strb r3, [r0], #1               @     *dest++ = R3 (byte)
    subs r2, r2, #1                 @     len--
    b 1b                            @ WHILE true
@ End of size optimized forward implementation

#else
@ Performance optimized forward implementation:
@ Reached by fallthrough from the entry code with R0 = dest, R1 = src, R2 = len and
@ {orig_dest, return_addr} on the stack.
@ Throughout this variant R2 is kept biased by the size of the copy step currently being
@ tried (see the "len offset" notes), so that a plain sign test answers "is there enough left?".
    @ Skip to 3 byte tail if we have to copy less than 4 bytes
    subs r2, r2, #4                 @ len -= 4                                           @ len offset: -4
    blt 6f                          @ IF len < 0: THEN GOTO [remainder3]
    @ If the destination is misaligned, align it
    @ This will return to label 3 if the source is aligned after the destination was aligned
    ands r12, r0, #3                @ misalignment = dest & 3
    bne 7f                          @ IF misalignment != 0: THEN GOTO [align]
    @ The destination is already aligned here. If the source is not aligned as well,
    @ we have no option but to fall back to byte by byte copying
    tst r1, #3                      @ TEST src & 3
    bne 8f                          @ IF src & 3: THEN GOTO [bytewise]
                                    @ (fallthrough)
                                    @
                                    @
3:                                  @ [aligned]: we are at least 4 byte aligned
    @ Entered (by fallthrough or from [align]) with both pointers word aligned,
    @ R2 = remaining - 4 and {orig_dest, return_addr} on the stack.
    @ Skip to 11 byte tail if we have to copy less than 12 bytes
    subs r2, r2, #8                 @ len -= 8                                           @ len offset: -12
    blt 5f                          @ IF len < 0: THEN GOTO [remainder11]
    @ Skip to 31 byte tail if we have to copy less than 32 bytes
    subs r2, r2, #0x14              @ len -= 20                                          @ len offset: -32
    blt 4f                          @ IF len < 0: THEN GOTO [remainder31]
    @ Save R4 so that we have an additional copying scratchpad register
    str r4, [sp, #-4]!              @ SAVE R4                                            @ STACK: R4 orig_dest return_addr
1:                                  @ DO:
    @ Copy 32 bytes at a time, as two 16 byte (4 register) transfers.
    @ LR is used as a data register here; the return address was saved on the stack for that reason.
    ldmia r1!, {r3, r4, r12, lr}    @     {R3,R4,R12,LR} = *src++ (16 bytes)
    stmia r0!, {r3, r4, r12, lr}    @     *dest++ = {R3,R4,R12,LR} (16 bytes)
    ldmia r1!, {r3, r4, r12, lr}    @     {R3,R4,R12,LR} = *src++ (16 bytes)
    stmia r0!, {r3, r4, r12, lr}    @     *dest++ = {R3,R4,R12,LR} (16 bytes)
    subs r2, r2, #0x20              @     len -= 32
    bge 1b                          @ WHILE len >= 0
    @ Less than 32 bytes remaining, copy 16 if enough are remaining
    @ (R2 is still biased by -32, so "R2 + 16 >= 0" means "at least 16 bytes left")
    cmn r2, #0x10                   @ TEST len + 16
    ittt ge                         @ IF len >= -16:
    ldmiage r1!, {r3, r4, r12, lr}  @     THEN {R3,R4,R12,LR} = *src++ (16 bytes)
    stmiage r0!, {r3, r4, r12, lr}  @     THEN *dest++ = {R3,R4,R12,LR} (16 bytes)
    subge r2, r2, #0x10             @     THEN len -= 16
    @ No need for R4 anymore, restore it so that later code doesn't have to take care of it
    ldr r4, [sp], #4                @ RESTORE R4                                         @ STACK: orig_dest return_addr
                                    @ (fallthrough)
                                    @
                                    @
4:                                  @ [remainder31]: we have less than 32 bytes remaining
    @ Entered with R2 = remaining - 32 (negative) and only {orig_dest, return_addr} on the stack.
    @ Correct length offset from 32 byte copying mode
    adds r2, r2, #0x14              @ len += 20                                          @ len offset: -12
1:                                  @ DO:
    @ Copy 12 bytes at a time, while enough are remaining
    @ The GE condition is taken from the adds above on the first pass and from the
    @ subsge below when the loop branches back; the whole body, including the branch, is
    @ one IT block.
    itttt ge                        @     IF len >= 0:
    ldmiage r1!, {r3, r12, lr}      @         THEN {R3,R12,LR} = *src++ (12 bytes)
    stmiage r0!, {r3, r12, lr}      @         THEN *dest++ = {R3,R12,LR} (12 bytes)
    subsge r2, r2, #0x0c            @         THEN len -= 12
    bge 1b                          @ WHILE len >= 12
                                    @ (fallthrough)
                                    @
                                    @
5:                                  @ [remainder11]: we have less than 12 bytes remaining
    @ Entered with R2 = remaining - 12 (negative). Copies 0, 4 or 8 bytes and leaves
    @ R2 = remaining - 4 for [remainder3].
    @ Correct length offset from 12 byte copying mode
    adds r2, r2, #8                 @ len += 8                                           @ len offset: -4
    @ Skip to 3 byte tail if less than 4 bytes are remaining
    blt 6f                          @ IF len < 0: THEN GOTO [remainder3]
    @ We will copy at least 4 bytes, adjust length
    subs r2, r2, #4                 @ len -= 4
    @ The flags from this subs select exactly one of the two IT blocks below; the loads and
    @ stores in between do not modify them.
    @ If less than 8 bytes are remaining, copy 4 bytes
    itt lt                          @ IF len < 0:
    ldrlt r3, [r1], #4              @     THEN R3 = *src++ (word)
    strlt r3, [r0], #4              @     THEN *dest++ = R3 (word)
    @ If at least 8 bytes are remaining, copy 8 bytes
    ittt ge                         @ IF len >= 0:
    ldmiage r1!, {r3, r12}          @     THEN {R3,R12} = *src++ (dword)
    stmiage r0!, {r3, r12}          @     THEN *dest++ = {R3,R12} (dword)
    @ We have subtracted 4 bytes above but copied 8, adjust length
    subge r2, r2, #4                @     THEN len -= 4
                                    @ (fallthrough)
                                    @
                                    @
6:                                  @ [remainder3]: we have less than 4 bytes remaining, copy them individually
    @ Entered with R2 = remaining - 4 (negative) and {orig_dest, return_addr} on the stack.
    @ Correct length offset from 4 byte copying mode
    adds r2, r2, #4                 @ len += 4                                           @ len offset: 0
    @ If we're finished, return
    it eq                           @ IF len == 0:
    ldmfdeq sp!, {r0, pc}           @     THEN RETURN orig_dest
    @ One compare drives both conditional pairs below: ldrb/strb leave the flags untouched
    cmp r2, #2                      @ TEST len - 2
    @ We always have to copy at least one byte
    ldrb r3, [r1], #1               @ R3 = *src++ (byte)
    strb r3, [r0], #1               @ *dest++ = R3 (byte)
    @ If we have to copy at least two, copy another one
    itt ge                          @ IF len >= 2:
    ldrbge r3, [r1], #1             @     THEN R3 = *src++ (byte)
    strbge r3, [r0], #1             @     THEN *dest++ = R3 (byte)
    @ If we have to copy more than two (which is always 3), copy another one
    itt gt                          @ IF len > 2:
    ldrbgt r3, [r1], #1             @     THEN R3 = *src++ (byte)
    strbgt r3, [r0], #1             @     THEN *dest++ = R3 (byte)
    @ Everything copied, return
    ldmfd sp!, {r0, pc}             @ RETURN orig_dest
                                    @
                                    @
7:                                  @ [align]: the destination is misaligned, align it
    @ Entered with R12 = dest & 3 (1..3) and R2 = len - 4 >= 0, so the up to 3 bytes
    @ copied here are always available.
    @ Negate the misalignment to figure out how much we have to adjust
    rsb r12, r12, #4                @ misalignment = 4 - misalignment
    @ One compare drives both conditional pairs below: ldrb/strb leave the flags untouched
    cmp r12, #2                     @ TEST misalignment - 2
    @ We always have to copy at least one byte
    ldrb r3, [r1], #1               @ R3 = *src++ (byte)
    strb r3, [r0], #1               @ *dest++ = R3 (byte)
    @ If we have to copy at least two, copy another one
    itt ge                          @ IF misalignment >= 2:
    ldrbge r3, [r1], #1             @     THEN R3 = *src++ (byte)
    strbge r3, [r0], #1             @     THEN *dest++ = R3 (byte)
    @ If we have to copy more than two (which is always 3), copy another one
    itt gt                          @ IF misalignment > 2:
    ldrbgt r3, [r1], #1             @     THEN R3 = *src++ (byte)
    strbgt r3, [r0], #1             @     THEN *dest++ = R3 (byte)
    @ The destination is aligned now, check if there are at least 4 bytes remaining
    subs r2, r2, r12                @ len -= misalignment                                @ len offset: still -4
    @ Skip to 3 byte tail if not
    blt 6b                          @ IF len < 0: THEN GOTO [remainder3]
    @ If the source is now misaligned, we have to copy byte by byte
    tst r1, #3                      @ TEST src & 3
    @ If not, resume fast copying method above
    beq 3b                          @ IF !(src & 3): GOTO [aligned]
                                    @ (fallthrough)
                                    @
                                    @
8:                                  @ [bytewise]: the destination is aligned, but the source isn't, copy byte by byte
    @ Entered with R2 = remaining - 4 >= 0. The loop runs until R2 reaches -1, i.e. it
    @ copies all but the last 3 bytes; those are left to [remainder3].
                                    @ DO:
    @ Stupidly copy a byte at a time
    ldrb r3, [r1], #1               @     R3 = *src++ (byte)
    strb r3, [r0], #1               @     *dest++ = R3 (byte)
    subs r2, r2, #1                 @     len--
    bge 8b                          @ WHILE len >= 0
    @ Less than 4 bytes remaining, use 3 byte tail copying code above
    b 6b                            @ GOTO [remainder3]
@ End of performance optimized forward implementation
#endif

9:                                  @ [backward]: the destination is above the source, so we need to copy backwards
    @ This label is only reached through the "bcc 9f" at function entry, which is taken
    @ BEFORE the forward path saves its registers. Every backward variant below returns with
    @ "ldmfd sp!, {r0, pc}" (and the default variant additionally uses LR as a data register),
    @ so the same two words have to be pushed here. This must happen before dest is advanced,
    @ because the value popped into R0 on return has to be the original dest.
    stmfd sp!, {r0, lr}             @ SAVE return_addr orig_dest                         @ STACK: orig_dest return_addr
    @ Jump to end of src and dest, and copy backwards
    add r1, r1, r2                  @ src += len
    add r0, r0, r2                  @ dest += len
                                    @ (fallthrough)

#if defined(SQUEEZE)
@ Absolute minimum size backward implementation:
@ Reached by fallthrough from label 9 with R0 = dest + len, R1 = src + len, R2 = len.
@ Returns by popping {orig_dest, return_addr}, so that pair must be on the stack on entry.
1:                                  @ DO:
    @ Stupidly copy a byte at a time
    @ The length is decremented and tested BEFORE each byte, so len == 0 copies nothing.
    @ The test is signed (LT).
    subs r2, r2, #1                 @     len--
    it lt                           @     IF len < 0:
    ldmfdlt sp!, {r0, pc}           @         THEN RETURN orig_dest
    ldrb r3, [r1, #-1]!             @     R3 = *--src (byte)
    strb r3, [r0, #-1]!             @     *--dest = R3 (byte)
    b 1b                            @ WHILE true
@ End of absolute minimum size backward implementation

#elif defined(OPTIMIZE_SIZE)
@ Size optimized backward implementation:
@ Reached by fallthrough from label 9 with R0 = dest + len, R1 = src + len, R2 = len.
@ Returns by popping {orig_dest, return_addr}, so that pair must be on the stack on entry.
@ Word copies are used only if BOTH end pointers are 4 byte aligned; otherwise everything
@ is copied by the byte loop at label 2.
@ While in word mode R2 holds (remaining - 4), so "R2 >= 0" means "at least one more word".
    @ Skip to tail if we have to copy less than 4 bytes
    subs r2, r2, #4                 @ len -= 4                                           @ len offset: -4
    blt 2f                          @ IF len < 0: THEN GOTO [remainder]
    @ Skip to tail if src or dest are misaligned
    tst r0, #3                      @ TEST dest & 3
    it eq                           @ IF !(dest & 3):
    tsteq r1, #3                    @     THEN TEST src & 3
    bne 2f                          @ IF (dest & 3) OR (src & 3): THEN GOTO [remainder]
    @ Copy 4 bytes at a time until less than 4 are remaining
1:                                  @ DO:
    ldr r3, [r1, #-4]!              @     R3 = *--src (word)
    str r3, [r0, #-4]!              @     *--dest = R3 (word)
    subs r2, r2, #4                 @     len -= 4
    bge 1b                          @ WHILE: len >= 0
                                    @ (fallthrough)
                                    @
2:                                  @ [remainder]: copy the remainder byte by byte
    @ Correct length offset from 4 byte copying mode
    adds r2, r2, #4                 @ len += 4                                           @ len offset: 0
1:                                  @ DO:
    @ Copy the remainder a byte at a time
    @ The Z flag tested here comes from the adds above on the first pass and from the
    @ subs at the bottom of the loop on every later pass.
    it eq                           @     IF len == 0:
    ldmfdeq sp!, {r0, pc}           @         THEN RETURN orig_dest
    ldrb r3, [r1, #-1]!             @     R3 = *--src (byte)
    strb r3, [r0, #-1]!             @     *--dest = R3 (byte)
    subs r2, r2, #1                 @     len--
    b 1b                            @ WHILE true
@ End of size optimized backward implementation

#else
@ Performance optimized backward implementation:
@ Reached by fallthrough from label 9 with R0 = dest + len, R1 = src + len, R2 = len.
@ All exits pop {orig_dest, return_addr}, so that pair must be on the stack on entry.
@ Mirrors the forward variant: R2 is kept biased by the size of the copy step currently
@ being tried (see the "len offset" notes), and all alignment tests are made on the END
@ pointers because copying proceeds downwards from there.
    @ Skip to 3 byte tail if we have to copy less than 4 bytes
    subs r2, r2, #4                 @ len -= 4                                           @ len offset: -4
    blt 6f                          @ IF len < 0: THEN GOTO [remainder3]
    @ If the destination is misaligned, align it
    @ This will return to label 3 if the source is aligned after the destination was aligned
    ands r12, r0, #3                @ misalignment = dest & 3
    bne 7f                          @ IF misalignment != 0: THEN GOTO [align]
    @ The destination is already aligned here. If the source is not aligned as well,
    @ we have no option but to fall back to byte by byte copying
    tst r1, #3                      @ TEST src & 3
    bne 8f                          @ IF src & 3: THEN GOTO [bytewise]
                                    @ (fallthrough)
                                    @
                                    @
3:                                  @ [aligned]: we are at least 4 byte aligned
    @ Entered (by fallthrough or from [align]) with both pointers word aligned and
    @ R2 = remaining - 4.
    @ Skip to 11 byte tail if we have to copy less than 12 bytes
    subs r2, r2, #8                 @ len -= 8                                           @ len offset: -12
    blt 5f                          @ IF len < 0: THEN GOTO [remainder11]
    @ Skip to 31 byte tail if we have to copy less than 32 bytes
    subs r2, r2, #0x14              @ len -= 20                                          @ len offset: -32
    blt 4f                          @ IF len < 0: THEN GOTO [remainder31]
    @ Save R4 so that we have an additional copying scratchpad register
    str r4, [sp, #-4]!              @ SAVE R4                                            @ STACK: R4 orig_dest return_addr
1:                                  @ DO:
    @ Copy 32 bytes at a time, as two 16 byte (4 register) transfers.
    @ LR is used as a data register here, so the return address has to live on the stack.
    ldmdb r1!, {r3, r4, r12, lr}    @     {R3,R4,R12,LR} = *--src (16 bytes)
    stmdb r0!, {r3, r4, r12, lr}    @     *--dest = {R3,R4,R12,LR} (16 bytes)
    ldmdb r1!, {r3, r4, r12, lr}    @     {R3,R4,R12,LR} = *--src (16 bytes)
    stmdb r0!, {r3, r4, r12, lr}    @     *--dest = {R3,R4,R12,LR} (16 bytes)
    subs r2, r2, #0x20              @     len -= 32
    bge 1b                          @ WHILE len >= 0
    @ Less than 32 bytes remaining, copy 16 if enough are remaining
    @ (R2 is still biased by -32, so "R2 + 16 >= 0" means "at least 16 bytes left")
    cmn r2, #0x10                   @ TEST len + 16
    ittt ge                         @ IF len >= -16:
    ldmdbge r1!, {r3, r4, r12, lr}  @     THEN {R3,R4,R12,LR} = *--src (16 bytes)
    stmdbge r0!, {r3, r4, r12, lr}  @     THEN *--dest = {R3,R4,R12,LR} (16 bytes)
    subge r2, r2, #0x10             @     THEN len -= 16
    @ No need for R4 anymore, restore it so that later code doesn't have to take care of it
    ldr r4, [sp], #4                @ RESTORE R4                                         @ STACK: orig_dest return_addr
                                    @ (fallthrough)
                                    @
                                    @
4:                                  @ [remainder31]: we have less than 32 bytes remaining
    @ Entered with R2 = remaining - 32 (negative); R4 is not (or no longer) on the stack.
    @ Correct length offset from 32 byte copying mode
    adds r2, r2, #0x14              @ len += 20                                          @ len offset: -12
1:                                  @ DO:
    @ Copy 12 bytes at a time, while enough are remaining
    @ The GE condition is taken from the adds above on the first pass and from the
    @ subsge below when the loop branches back; the whole body, including the branch, is
    @ one IT block.
    itttt ge                        @     IF len >= 0:
    ldmdbge r1!, {r3, r12, lr}      @         THEN {R3,R12,LR} = *--src (12 bytes)
    stmdbge r0!, {r3, r12, lr}      @         THEN *--dest = {R3,R12,LR} (12 bytes)
    subsge r2, r2, #0x0c            @         THEN len -= 12
    bge 1b                          @ WHILE len >= 12
                                    @ (fallthrough)
                                    @
                                    @
5:                                  @ [remainder11]: we have less than 12 bytes remaining
    @ Entered with R2 = remaining - 12 (negative). Copies 0, 4 or 8 bytes downwards and
    @ leaves R2 = remaining - 4 for [remainder3].
    @ Correct length offset from 12 byte copying mode
    adds r2, r2, #8                 @ len += 8                                           @ len offset: -4
    @ Skip to 3 byte tail if less than 4 bytes are remaining
    blt 6f                          @ IF len < 0: THEN GOTO [remainder3]
    @ We will copy at least 4 bytes, adjust length
    subs r2, r2, #4                 @ len -= 4
    @ The flags from this subs select exactly one of the two IT blocks below; the loads and
    @ stores in between do not modify them.
    @ If less than 8 bytes are remaining, copy 4 bytes
    itt lt                          @ IF len < 0:
    ldrlt r3, [r1, #-4]!            @     THEN R3 = *--src (word)
    strlt r3, [r0, #-4]!            @     THEN *--dest = R3 (word)
    @ If at least 8 bytes are remaining, copy 8 bytes
    @ This is the backward path, so the transfer has to be decrement-before (DB) like every
    @ other multi-register copy here: it moves the 8 bytes just BELOW the current pointers and
    @ writes the lowered addresses back. An increment-after (IA) transfer would read and write
    @ above the pointers and advance them in the wrong direction.
    ittt ge                         @ IF len >= 0:
    ldmdbge r1!, {r3, r12}          @     THEN {R3,R12} = *--src (dword)
    stmdbge r0!, {r3, r12}          @     THEN *--dest = {R3,R12} (dword)
    @ We have subtracted 4 bytes above but copied 8, adjust length
    subge r2, r2, #4                @     THEN len -= 4
                                    @ (fallthrough)
                                    @
                                    @
6:                                  @ [remainder3]: we have less than 4 bytes remaining, copy them individually
    @ Entered with R2 = remaining - 4 (negative). Both exits pop {orig_dest, return_addr}.
    @ Correct length offset from 4 byte copying mode
    adds r2, r2, #4                 @ len += 4                                           @ len offset: 0
    @ If we're finished, return
    it eq                           @ IF len == 0:
    ldmfdeq sp!, {r0, pc}           @     THEN RETURN orig_dest
    @ One compare drives both conditional pairs below: ldrb/strb leave the flags untouched
    cmp r2, #2                      @ TEST len - 2
    @ We always have to copy at least one byte
    ldrb r3, [r1, #-1]!             @ R3 = *--src (byte)
    strb r3, [r0, #-1]!             @ *--dest = R3 (byte)
    @ If we have to copy at least two, copy another one
    itt ge                          @ IF len >= 2:
    ldrbge r3, [r1, #-1]!           @     THEN R3 = *--src (byte)
    strbge r3, [r0, #-1]!           @     THEN *--dest = R3 (byte)
    @ If we have to copy more than two (which is always 3), copy another one
    itt gt                          @ IF len > 2:
    ldrbgt r3, [r1, #-1]!           @     THEN R3 = *--src (byte)
    strbgt r3, [r0, #-1]!           @     THEN *--dest = R3 (byte)
    @ Everything copied, return
    ldmfd sp!, {r0, pc}             @ RETURN orig_dest
                                    @
                                    @
7:                                  @ [align]: the destination is misaligned, align it
    @ Entered with R12 = (end of dest) & 3 (1..3) and R2 = len - 4 >= 0, so the up to
    @ 3 bytes copied here are always available.
    @ No need to negate the misalignment here, we are going backwards so we have to adjust by the misaligned amount
    @ One compare drives both conditional pairs below: ldrb/strb leave the flags untouched
    cmp r12, #2                     @ TEST misalignment - 2
    @ We always have to copy at least one byte
    ldrb r3, [r1, #-1]!             @ R3 = *--src (byte)
    strb r3, [r0, #-1]!             @ *--dest = R3 (byte)
    @ If we have to copy at least two, copy another one
    itt ge                          @ IF misalignment >= 2:
    ldrbge r3, [r1, #-1]!           @     THEN R3 = *--src (byte)
    strbge r3, [r0, #-1]!           @     THEN *--dest = R3 (byte)
    @ If we have to copy more than two (which is always 3), copy another one
    itt gt                          @ IF misalignment > 2:
    ldrbgt r3, [r1, #-1]!           @     THEN R3 = *--src (byte)
    strbgt r3, [r0, #-1]!           @     THEN *--dest = R3 (byte)
    @ The destination is aligned now, check if there are at least 4 bytes remaining
    subs r2, r2, r12                @ len -= misalignment                                @ len offset: still -4
    @ Skip to 3 byte tail if not
    blt 6b                          @ IF len < 0: THEN GOTO [remainder3]
    @ If the source is now misaligned, we have to copy byte by byte
    tst r1, #3                      @ TEST src & 3
    @ If not, resume fast copying method above
    beq 3b                          @ IF !(src & 3): GOTO [aligned]
                                    @ (fallthrough)
                                    @
                                    @
8:                                  @ [bytewise]: the destination is aligned, but the source isn't, copy byte by byte
    @ Entered with R2 = remaining - 4 >= 0. The loop runs until R2 reaches -1, i.e. it
    @ copies all but the last 3 bytes; those are left to [remainder3].
                                    @ DO:
    @ Stupidly copy a byte at a time
    ldrb r3, [r1, #-1]!             @     R3 = *--src (byte)
    strb r3, [r0, #-1]!             @     *--dest = R3 (byte)
    subs r2, r2, #1                 @     len--
    bge 8b                          @ WHILE len >= 0
    @ Less than 4 bytes remaining, use 3 byte tail copying code above
    b 6b                            @ GOTO [remainder3]
@ End of performance optimized backward implementation
#endif
@ Both symbols start at the same address, so both get the same size: everything from the
@ shared entry label up to this point.
.size memmove, . - memmove
.size memcpy, . - memcpy


Create a new paste based on this one


Comments: