/**************************************************************************
** Actor =
**
** Component = Libraries
** Module = SPARC/memset.x
**
** Synopsis = System V memset.
**
** Originally written by = Gordon Irlam
** Responsible = Gordon Irlam
**
** Copyright (c) 1990 Department of Computer Science, Adelaide University
**
***************************************************************************
** $Header: /usr/home/projects/mars/people/gordoni/rcs.files/chorus/chorus_3.2/lib/SPARC/RCS/utMemSet.x,v 1.2 90/09/10 14:11:30 gordoni Exp $
***************************************************************************
* The interface between assembly code and C code follows the Sparc register
* conventions.  This requires the caller to place parameters in %o0 through
* %o5.  The return value, if any, is returned in %o0.  Register %o6 is the
* stack pointer, and %o7 stores the return address.
*
* The callee can use %o0 through %o5, and %g1 through %g3 for scratch values.
* Registers %g4 through %g7 must never be used.  Register %o6 must always
* contain a pointer to the current top of the stack.  All other registers can
* be used only if they are restored prior to return.
*
* The stack grows downwards and the stack pointer must be double word aligned.
* On the top of the stack is space for the operating system to store the 16 in
* and local registers currently used.  This is performed on window overflow,
* and context switch.  Below this is a word used to hold the address at which
* structure return values should be stored.  This is followed by six words that
* can be used by the called routine to store the first six arguments it was
* passed in registers.  This is followed by any additional arguments that are
* passed.
*/

! Exported symbols.

        .global _amemset                ! ANSI C memory fill.

/***************************************************************************
* void *memset(void *s, int c, int n): Fill the first n bytes starting at s
* with the value c.  Return s.
*
* Register usage:
*   %o0      s on entry.  Never written, so it still holds s at return.
*   %o1      c.  stb stores only its low byte; it is masked to 8 bits at
*            big_fill before being replicated.
*   %o2      Bytes still to fill.  While testing for and running a chunk
*            loop it is held below the true count by the chunk size, so
*            the sign of the deccc result says whether another chunk
*            fits; the matching inc restores the true count afterwards.
*   %o3      Address of the next byte to fill.
*   %o4/%o5  Fill byte replicated into all 8 bytes; written with std.
*
* Strategy: n < 0 returns at once.  n <= 7 goes straight to the byte loop.
* Otherwise single bytes are stored until %o3 is double word aligned, then
* 64 byte chunks, then 8 byte chunks, then the remaining bytes one by one.
*
* Branches written with ",a" annul their delay slot when not taken, so the
* count corrections placed in those slots only happen on the taken path.
***************************************************************************
*/

_amemset:
        tst     %o2                     ! Finished if n < 0.
        bneg    fill_done
        mov     %o0, %o3                ! Current s in %o3 (delay slot).
        cmp     %o2, 7
        ble     last_fill               ! Make sure can fill to a double word
                                        ! boundary: n <= 7 uses the byte loop.

        ! Fill byte by byte until reach double word boundary.

        btst    7, %o3                  ! Delay slot of the ble above.
        be,a    big_fill                ! Already double word aligned.
        nop
first_chunk:
        stb     %o1, [%o3]              ! Fill byte.
        inc     %o3
        btst    7, %o3
        bne     first_chunk
        dec     %o2                     ! Delay slot: one byte consumed on
                                        ! every pass, taken or not.

big_fill:

        ! Fill %o4 and %o5 with the fill value.

        and     %o1, 0xff, %o1          ! Make sure fill value is a single byte.
        sll     %o1, 8, %o4
        or      %o4, %o1, %o4           ! Fill value in low halfword of %o4.
        sll     %o4, 16, %o5
        or      %o4, %o5, %o4           ! Fill value in entire word of %o4.
        mov     %o4, %o5

        deccc   64, %o2                 ! See if can fill a 64 byte chunk.
        bneg,a  small_fill
        inc     64, %o2                 ! Delay slot: undo the bias (taken
                                        ! path only).
big_chunk:
        std     %o4, [%o3 + 0]          ! Fill a 64 byte chunk.
        std     %o4, [%o3 + 8]
        std     %o4, [%o3 + 16]
        std     %o4, [%o3 + 24]
        std     %o4, [%o3 + 32]
        std     %o4, [%o3 + 40]
        std     %o4, [%o3 + 48]
        deccc   64, %o2                 ! Room for another 64 bytes?  std does
                                        ! not change the condition codes.
        std     %o4, [%o3 + 56]
        bpos    big_chunk
        inc     64, %o3                 ! Delay slot.
        inc     64, %o2                 ! Undo the bias: %o2 = bytes left.

small_fill:
        deccc   8, %o2                  ! See if can fill an 8 byte chunk.
        bneg,a  last_fill
        inc     8, %o2                  ! Delay slot: undo the bias (taken
                                        ! path only).
small_chunk:
        deccc   8, %o2                  ! Room for another 8 bytes after this?
        std     %o4, [%o3 + 0]          ! Fill an 8 byte chunk.
        bpos    small_chunk
        inc     8, %o3                  ! Delay slot.
        inc     8, %o2                  ! Undo the bias: %o2 = bytes left.

last_fill:
        deccc   %o2                     ! See if can fill next byte.
        bneg,a  fill_done
        nop
last_chunk:
        deccc   %o2                     ! Will another byte follow this one?
        stb     %o1, [%o3]              ! Fill byte.
        bpos,a  last_chunk
        inc     %o3                     ! Delay slot (only when looping).

fill_done:
        retl                            ! Return value will still be in %o0.
        nop

/**************************************************************************
** Actor =
**
** Component = Libraries
** Module = SPARC/memmove.x
**
** Synopsis = System V memmove.
**
** Originally written by = Gordon Irlam
** Responsible = Gordon Irlam
**
** Copyright (c) 1990 Department of Computer Science, Adelaide University
**
***************************************************************************
** $Header: /a/berlioz/ed/projects/mars/people/gordoni/rcs.files/chorus/chorus_3.2/lib/SPARC/RCS/utMemMove.x,v 1.3 90/09/27 18:18:40 gordoni Exp $
***************************************************************************
* The interface between assembly code and C code follows the Sparc register
* conventions.  This requires the caller to place parameters in %o0 through
* %o5.  The return value, if any, is returned in %o0.  Register %o6 is the
* stack pointer, and %o7 stores the return address.
*
* The callee can use %o0 through %o5, and %g1 through %g3 for scratch values.
* Registers %g4 through %g7 must never be used.  Register %o6 must always
* contain a pointer to the current top of the stack.  All other registers can
* be used only if they are restored prior to return.
*
* The stack grows downwards and the stack pointer must be double word aligned.
* On the top of the stack is space for the operating system to store the 16 in
* and local registers currently used.  This is performed on window overflow,
* and context switch.  Below this is a word used to hold the address at which
* structure return values should be stored.  This is followed by six words that
* can be used by the called routine to store the first six arguments it was
* passed in registers.  This is followed by any additional arguments that are
* passed.
*/

! Exported symbols.

        .global _amemmove               ! ANSI C Overlapping memory copy.

/***************************************************************************
* void *memmove(void *dest, void *src, int n): Copy n bytes from src to dest.
* Safe to use even if memory areas overlap.  Return dest.
*
* Register usage:
*   %o0      dest on entry.  Never written, so it still holds dest at
*            return.
*   %o1      Source cursor.  Forward copy: address of the next byte to
*            read.  Backward copy: one past the next byte to read.
*   %o2      Bytes still to copy.  While testing for and running a chunk
*            loop it is held below the true count by the chunk size, so
*            the sign of a deccc/inccc result says whether another chunk
*            fits; the matching inc/inccc restores it afterwards.
*   %o3      Destination cursor, same convention as %o1.
*   %o4, %o5, %g2, %g3
*            Data in flight.  (%o4,%o5) and (%g2,%g3) are the register
*            pairs used by ldd/std.
*   %g1      In the *_byte_N_* loops: source bytes that have been loaded
*            but not yet stored, carried into the next merge step.
*
* Structure:
*   n <= 0 returns at once.  If dest >= src (unsigned) the copy runs
*   backwards from the end of both areas, otherwise forwards.  The two
*   directions are mirror images of each other:
*
*   1. n <= 7: copy with the tail loop (at most two bytes per pass).
*   2. Copy 1, 2 or 3 bytes so that the source cursor is word aligned.
*   3. Destination word aligned: copy one word if needed to make the
*      destination double word aligned, then 64 byte chunks (ldd when
*      the source is double word aligned too, otherwise ld pairs), then
*      8 byte chunks, then the tail loop.
*   4. Destination not word aligned: copy one word with byte stores if
*      needed so that the std addresses used in the loops below come out
*      double word aligned.  With fewer than 32 + 4 bytes left, copy 8
*      bytes per pass using word loads and byte stores.  Otherwise
*      dispatch on destination mod 4 to a loop that loads words, shifts
*      and merges them, and stores aligned double words, 32 bytes per
*      pass.  Leftovers go to the 8 byte loop and then the tail loop.
*
* Branches written with ",a" annul their delay slot when not taken
* ("ba,a" always annuls it), so count corrections placed in those slots
* only happen on the taken path.  Several branches test condition codes
* set a few instructions earlier; the intervening loads, stores, shifts,
* or, inc and dec do not modify the condition codes.
***************************************************************************
*/

_amemmove:
        tst     %o2                     ! Finished if n <= 0.
        ble     copy_done
        cmp     %o0, %o1                ! Delay slot.  Copy backwards if
                                        ! dest >= src.
        bgeu    bwd_copy
        cmp     %o2, 7                  ! Delay slot.
        ble     fwd_last_copy           ! Make sure can copy first few bytes.
        mov     %o0, %o3                ! Current destination in %o3 (delay slot).

        ! Copy slowly until reach word boundary for source.

        btst    3, %o1                  ! Source already word aligned?
        be      fwd_third_copy
        btst    1, %o1                  ! Delay slot.
        be,a    fwd_second_copy         ! Source mod 4 = 2: half word only.
        lduh    [%o1], %o4              ! Delay slot (taken path only).

        ldub    [%o1], %o4              ! Copy byte.
        inc     %o1
        stb     %o4, [%o3]
        inc     %o3
        btst    3, %o1
        be      fwd_third_copy
        dec     %o2                     ! Delay slot: always count the byte.

        lduh    [%o1], %o4              ! Copy half word.
fwd_second_copy:
        inc     2, %o1
        srl     %o4, 8, %o5
        stb     %o5, [%o3 + 0]          ! High byte of the half word first.
        stb     %o4, [%o3 + 1]
        inc     2, %o3
        dec     2, %o2

fwd_third_copy:
        btst    3, %o3
        beq     fwd_word_copy           ! Destination mod 4 = 0.
        btst    4, %o3                  ! Delay slot; also tested on entry
                                        ! to fwd_word_copy.

        ! Copy next word if destination is in low half of a double word.

        bne,a   fwd_test_alignment
        deccc   32 + 4, %o2             ! Delay slot (taken path only).

        ld      [%o1], %o4              ! Copy word.
        inc     4, %o1
        srl     %o4, 24, %g3
        srl     %o4, 16, %g2
        srl     %o4, 8, %o5
        stb     %g3, [%o3 + 0]
        stb     %g2, [%o3 + 1]
        stb     %o5, [%o3 + 2]
        stb     %o4, [%o3 + 3]
        inc     4, %o3

        deccc   32 + 4 + 4, %o2         ! See if can copy a 32 + 4 byte chunk
                                        ! (the extra 4 counts the word above).
fwd_test_alignment:
        bneg,a  fwd_small_byte_copy
        inc     32 + 4, %o2             ! Delay slot: undo the bias (taken
                                        ! path only).

        ! Call appropriate routine according to alignment of destination.

        btst    1, %o3
        beq     fwd_byte_2_copy         ! Destination mod 4 = 2.
        btst    2, %o3                  ! Delay slot.
        beq,a   fwd_byte_1_copy         ! Destination mod 4 = 1.
        nop
        ba,a    fwd_byte_3_copy         ! Destination mod 4 = 3.

fwd_word_copy:

        ! Copy slowly until destination is double word aligned.
        ! Condition codes here are from the btst 4, %o3 above.

        beq,a   fwd_word_test_alignment
        deccc   64, %o2                 ! See if can copy a 64 byte chunk (delay slot).

        ld      [%o1], %o4              ! Copy word.
        inc     4, %o1
        st      %o4, [%o3]
        inc     4, %o3

        deccc   64 + 4, %o2             ! See if can copy a 64 byte chunk
                                        ! (the extra 4 counts the word above).
fwd_word_test_alignment:
        bneg,a  fwd_small_word_copy
        inccc   64 - 8, %o2             ! See if can copy an 8 byte chunk (delay slot).

        btst    7, %o1                  ! Source double word aligned too?
        beq,a   fwd_doubleword_chunk
        nop

fwd_word_chunk:
        ld      [%o1 + 0], %o4          ! Copy a 64 byte chunk.
        ld      [%o1 + 4], %o5
        ld      [%o1 + 8], %g2
        ld      [%o1 + 12], %g3
        std     %o4, [%o3 + 0]
        std     %g2, [%o3 + 8]
        ld      [%o1 + 16], %o4
        ld      [%o1 + 20], %o5
        ld      [%o1 + 24], %g2
        ld      [%o1 + 28], %g3
        std     %o4, [%o3 + 16]
        std     %g2, [%o3 + 24]
        ld      [%o1 + 32], %o4
        ld      [%o1 + 36], %o5
        ld      [%o1 + 40], %g2
        ld      [%o1 + 44], %g3
        std     %o4, [%o3 + 32]
        std     %g2, [%o3 + 40]
        ld      [%o1 + 48], %o4
        ld      [%o1 + 52], %o5
        ld      [%o1 + 56], %g2
        ld      [%o1 + 60], %g3
        std     %o4, [%o3 + 48]
        std     %g2, [%o3 + 56]
        deccc   64, %o2                 ! Room for another 64 bytes?
        inc     64, %o1
        bpos    fwd_word_chunk
        inc     64, %o3                 ! Delay slot.

        ba      fwd_small_word_copy
        inccc   64 - 8, %o2             ! See if can copy an 8 byte chunk (delay slot).

fwd_doubleword_chunk:
        ldd     [%o1 + 0], %o4          ! Copy a 64 byte chunk.
        ldd     [%o1 + 8], %g2
        std     %o4, [%o3 + 0]
        std     %g2, [%o3 + 8]
        ldd     [%o1 + 16], %o4
        ldd     [%o1 + 24], %g2
        std     %o4, [%o3 + 16]
        std     %g2, [%o3 + 24]
        ldd     [%o1 + 32], %o4
        ldd     [%o1 + 40], %g2
        std     %o4, [%o3 + 32]
        std     %g2, [%o3 + 40]
        ldd     [%o1 + 48], %o4
        ldd     [%o1 + 56], %g2
        std     %o4, [%o3 + 48]
        std     %g2, [%o3 + 56]
        deccc   64, %o2                 ! Room for another 64 bytes?
        inc     64, %o1
        bpos    fwd_doubleword_chunk
        inc     64, %o3                 ! Delay slot.

        inccc   64 - 8, %o2             ! See if can copy an 8 byte chunk.
fwd_small_word_copy:
        bneg,a  fwd_last_copy_tested
        inccc   8, %o2                  ! See if can copy next byte (delay slot).
fwd_small_word_chunk:
        ld      [%o1 + 0], %o4          ! Copy an 8 byte chunk.
        ld      [%o1 + 4], %o5
        deccc   8, %o2                  ! Room for another 8 bytes after this?
        std     %o4, [%o3]
        inc     8, %o1
        bpos    fwd_small_word_chunk
        inc     8, %o3                  ! Delay slot.
        inc     8, %o2                  ! Undo the bias: %o2 = bytes left.

fwd_last_copy:
        tst     %o2                     ! See if can copy next byte.
fwd_last_copy_tested:
        ble     copy_done
fwd_last_chunk:
        deccc   2, %o2                  ! Delay slot of the ble above, and
                                        ! first instruction of the loop.
        ldub    [%o1 + 0], %o4          ! Copy byte.
        bneg    copy_done               ! Only one byte was left.
        stb     %o4, [%o3 + 0]          ! Delay slot.
        ldub    [%o1 + 1], %o4          ! Copy byte.
        inc     2, %o1
        stb     %o4, [%o3 + 1]
        bne,a   fwd_last_chunk          ! Still testing the deccc result.
        inc     2, %o3                  ! Delay slot (only when looping).

        retl                            ! Return value will still be in %o0.
        nop

fwd_small_byte_copy:
        deccc   8, %o2                  ! See if can copy an 8 byte chunk.
        bneg,a  fwd_last_copy_tested
        inccc   8, %o2                  ! See if can copy next byte (delay slot).
fwd_small_byte_chunk:
        ld      [%o1 + 0], %o4          ! Copy an 8 byte chunk.
        ld      [%o1 + 4], %o5
        srl     %o4, 24, %g2
        srl     %o4, 16, %g3
        stb     %g2, [%o3 + 0]
        srl     %o4, 8, %g2
        stb     %g3, [%o3 + 1]
        stb     %g2, [%o3 + 2]
        stb     %o4, [%o3 + 3]
        srl     %o5, 24, %g2
        srl     %o5, 16, %g3
        stb     %g2, [%o3 + 4]
        srl     %o5, 8, %g2
        stb     %g3, [%o3 + 5]
        stb     %g2, [%o3 + 6]
        stb     %o5, [%o3 + 7]
        inc     8, %o1
        ba      fwd_small_byte_copy
        inc     8, %o3                  ! Delay slot.

fwd_byte_1_copy:
        ld      [%o1], %o4              ! Load first 4 bytes.
        srl     %o4, 24, %g2
        srl     %o4, 8, %g3
        sll     %o4, 24, %g1            ! Last byte in top part of %g1.
        stb     %g2, [%o3 + 0]          ! Store first 3 bytes.
        sth     %g3, [%o3 + 1]
        inc     4, %o1
        inc     4, %o3
fwd_byte_1_chunk:
        ld      [%o1 + 0], %o4          ! Copy a 32 byte chunk.
        ld      [%o1 + 4], %o5
        deccc   32, %o2                 ! Tested by the bpos at the bottom.
        srl     %o4, 8, %g2             ! First 3 bytes in %g2.
        or      %g2, %g1, %g2           ! Add top byte in %g1 from previous cycle.
        sll     %o4, 24, %o4            ! Fourth byte in high part of %o4.
        srl     %o5, 8, %g3             ! Next 3 bytes in low part of %g3.
        or      %o4, %g3, %g3           ! Add fourth byte from %o4.
        sll     %o5, 24, %g1            ! Save last byte in %g1 for next cycle.
        ld      [%o1 + 8], %o4
        ld      [%o1 + 12], %o5
        std     %g2, [%o3 + 0 - 1]      ! Stores lag the cursor by 1 byte.
        srl     %o4, 8, %g2
        or      %g2, %g1, %g2
        sll     %o4, 24, %o4
        srl     %o5, 8, %g3
        or      %o4, %g3, %g3
        sll     %o5, 24, %g1
        ld      [%o1 + 16], %o4
        ld      [%o1 + 20], %o5
        std     %g2, [%o3 + 8 - 1]
        srl     %o4, 8, %g2
        or      %g2, %g1, %g2
        sll     %o4, 24, %o4
        srl     %o5, 8, %g3
        or      %o4, %g3, %g3
        sll     %o5, 24, %g1
        ld      [%o1 + 24], %o4
        ld      [%o1 + 28], %o5
        std     %g2, [%o3 + 16 - 1]
        srl     %o4, 8, %g2
        or      %g2, %g1, %g2
        sll     %o4, 24, %o4
        srl     %o5, 8, %g3
        or      %o4, %g3, %g3
        sll     %o5, 24, %g1
        std     %g2, [%o3 + 24 - 1]
        inc     32, %o1
        bpos    fwd_byte_1_chunk
        inc     32, %o3                 ! Delay slot.

        inc     32, %o2                 ! Undo the bias: %o2 = bytes left.
        srl     %g1, 24, %g1
        ba      fwd_small_byte_copy
        stb     %g1, [%o3 - 1]          ! Store last byte (delay slot).

fwd_byte_2_copy:
        ld      [%o1], %o4              ! Load first 4 bytes.
        srl     %o4, 16, %g3
        sll     %o4, 16, %g1            ! Second 2 bytes in top half of %g1.
        sth     %g3, [%o3]              ! Store first 2 bytes.
        inc     4, %o1
        inc     4, %o3
fwd_byte_2_chunk:
        ld      [%o1 + 0], %o4          ! Copy a 32 byte chunk.
        ld      [%o1 + 4], %o5
        deccc   32, %o2                 ! Tested by the bpos at the bottom.
        srl     %o4, 16, %g2            ! First 2 bytes in %g2.
        or      %g2, %g1, %g2           ! Add top 2 bytes in %g1 from previous cycle.
        sll     %o4, 16, %o4            ! Second 2 bytes in high part of %o4.
        srl     %o5, 16, %g3            ! Third 2 bytes in low part of %g3.
        or      %o4, %g3, %g3           ! Add second 2 bytes from %o4.
        sll     %o5, 16, %g1            ! Save last 2 bytes in %g1 for next cycle.
        ld      [%o1 + 8], %o4
        ld      [%o1 + 12], %o5
        std     %g2, [%o3 + 0 - 2]      ! Stores lag the cursor by 2 bytes.
        srl     %o4, 16, %g2
        or      %g2, %g1, %g2
        sll     %o4, 16, %o4
        srl     %o5, 16, %g3
        or      %o4, %g3, %g3
        sll     %o5, 16, %g1
        ld      [%o1 + 16], %o4
        ld      [%o1 + 20], %o5
        std     %g2, [%o3 + 8 - 2]
        srl     %o4, 16, %g2
        or      %g2, %g1, %g2
        sll     %o4, 16, %o4
        srl     %o5, 16, %g3
        or      %o4, %g3, %g3
        sll     %o5, 16, %g1
        ld      [%o1 + 24], %o4
        ld      [%o1 + 28], %o5
        std     %g2, [%o3 + 16 - 2]
        srl     %o4, 16, %g2
        or      %g2, %g1, %g2
        sll     %o4, 16, %o4
        srl     %o5, 16, %g3
        or      %o4, %g3, %g3
        sll     %o5, 16, %g1
        std     %g2, [%o3 + 24 - 2]
        inc     32, %o1
        bpos    fwd_byte_2_chunk
        inc     32, %o3                 ! Delay slot.

        inc     32, %o2                 ! Undo the bias: %o2 = bytes left.
        srl     %g1, 16, %g1
        ba      fwd_small_byte_copy
        sth     %g1, [%o3 - 2]          ! Store last 2 bytes (delay slot).

fwd_byte_3_copy:
        ld      [%o1], %o4              ! Load first 4 bytes.
        srl     %o4, 24, %g2
        sll     %o4, 8, %g1             ! Last 3 bytes in top part of %g1.
        stb     %g2, [%o3 + 0]          ! Store first byte.
        inc     4, %o1
        inc     4, %o3
fwd_byte_3_chunk:
        ld      [%o1 + 0], %o4          ! Copy a 32 byte chunk.
        ld      [%o1 + 4], %o5
        deccc   32, %o2                 ! Tested by the bpos at the bottom.
        srl     %o4, 24, %g2            ! First byte in %g2.
        or      %g2, %g1, %g2           ! Add top 3 bytes in %g1 from previous cycle.
        sll     %o4, 8, %o4             ! Next 3 bytes in high part of %o4.
        srl     %o5, 24, %g3            ! Next byte in low part of %g3.
        or      %o4, %g3, %g3           ! Add 3 bytes from %o4.
        sll     %o5, 8, %g1             ! Save last 3 bytes in %g1 for next cycle.
        ld      [%o1 + 8], %o4
        ld      [%o1 + 12], %o5
        std     %g2, [%o3 + 0 - 3]      ! Stores lag the cursor by 3 bytes.
        srl     %o4, 24, %g2
        or      %g2, %g1, %g2
        sll     %o4, 8, %o4
        srl     %o5, 24, %g3
        or      %o4, %g3, %g3
        sll     %o5, 8, %g1
        ld      [%o1 + 16], %o4
        ld      [%o1 + 20], %o5
        std     %g2, [%o3 + 8 - 3]
        srl     %o4, 24, %g2
        or      %g2, %g1, %g2
        sll     %o4, 8, %o4
        srl     %o5, 24, %g3
        or      %o4, %g3, %g3
        sll     %o5, 8, %g1
        ld      [%o1 + 24], %o4
        ld      [%o1 + 28], %o5
        std     %g2, [%o3 + 16 - 3]
        srl     %o4, 24, %g2
        or      %g2, %g1, %g2
        sll     %o4, 8, %o4
        srl     %o5, 24, %g3
        or      %o4, %g3, %g3
        sll     %o5, 8, %g1
        std     %g2, [%o3 + 24 - 3]
        inc     32, %o1
        bpos    fwd_byte_3_chunk
        inc     32, %o3                 ! Delay slot.

        inc     32, %o2                 ! Undo the bias: %o2 = bytes left.
        srl     %g1, 16, %g2
        srl     %g1, 8, %g1
        sth     %g2, [%o3 - 3]          ! Store last 3 bytes.
        ba      fwd_small_byte_copy
        stb     %g1, [%o3 - 1]          ! Delay slot.

bwd_copy:
        add     %o1, %o2, %o1           ! Start from other end.
        ble     bwd_last_copy           ! Make sure can copy first few bytes
                                        ! (tests the cmp %o2, 7 done in the
                                        ! delay slot of the bgeu).
        add     %o0, %o2, %o3           ! Current destination in %o3 (delay slot).

        ! Copy slowly until reach word boundary for source.

        btst    3, %o1                  ! Source end already word aligned?
        be      bwd_third_copy
        btst    1, %o1                  ! Delay slot.
        be,a    bwd_second_copy         ! Source mod 4 = 2: half word only.
        lduh    [%o1 - 2], %o4          ! Delay slot (taken path only).

        ldub    [%o1 - 1], %o4          ! Copy byte.
        dec     %o1
        stb     %o4, [%o3 - 1]
        dec     %o3
        btst    3, %o1
        be      bwd_third_copy
        dec     %o2                     ! Delay slot: always count the byte.

        lduh    [%o1 - 2], %o4          ! Copy half word.
bwd_second_copy:
        dec     2, %o1
        srl     %o4, 8, %o5
        stb     %o4, [%o3 - 1]          ! Low byte of the half word last.
        stb     %o5, [%o3 - 2]
        dec     2, %o3
        dec     2, %o2

bwd_third_copy:
        btst    3, %o3
        beq     bwd_word_copy           ! Destination mod 4 = 0.
        btst    4, %o3                  ! Delay slot; also tested on entry
                                        ! to bwd_word_copy.

        ! Copy next word if destination is in high half of a double word.

        beq,a   bwd_test_alignment
        deccc   32 + 4, %o2             ! Delay slot (taken path only).

        ld      [%o1 - 4], %o4          ! Copy word.
        dec     4, %o1
        srl     %o4, 8, %o5
        srl     %o4, 16, %g2
        srl     %o4, 24, %g3
        stb     %o4, [%o3 - 1]
        stb     %o5, [%o3 - 2]
        stb     %g2, [%o3 - 3]
        stb     %g3, [%o3 - 4]
        dec     4, %o3

        deccc   32 + 4 + 4, %o2         ! See if can copy a 32 + 4 byte chunk
                                        ! (the extra 4 counts the word above).
bwd_test_alignment:
        bneg,a  bwd_small_byte_copy
        inc     32 + 4, %o2             ! Delay slot: undo the bias (taken
                                        ! path only).

        ! Call appropriate routine according to alignment of dest.

        btst    1, %o3
        beq     bwd_byte_2_copy         ! Destination mod 4 = 2.
        btst    2, %o3                  ! Delay slot.
        beq,a   bwd_byte_1_copy         ! Destination mod 4 = 1.
        nop
        ba,a    bwd_byte_3_copy         ! Destination mod 4 = 3.

bwd_word_copy:

        ! Copy slowly until destination is double word aligned.
        ! Condition codes here are from the btst 4, %o3 above.

        beq,a   bwd_word_test_alignment
        deccc   64, %o2                 ! See if can copy a 64 byte chunk (delay slot).

        ld      [%o1 - 4], %o4          ! Copy word.
        dec     4, %o1
        st      %o4, [%o3 - 4]
        dec     4, %o3

        deccc   64 + 4, %o2             ! See if can copy a 64 byte chunk
                                        ! (the extra 4 counts the word above).
bwd_word_test_alignment:
        bneg,a  bwd_small_word_copy
        inccc   64 - 8, %o2             ! See if can copy an 8 byte chunk (delay slot).

        btst    7, %o1                  ! Source double word aligned too?
        beq,a   bwd_doubleword_chunk
        nop

bwd_word_chunk:
        ld      [%o1 - 4], %o5          ! Copy a 64 byte chunk.
        ld      [%o1 - 8], %o4
        ld      [%o1 - 12], %g3
        ld      [%o1 - 16], %g2
        std     %o4, [%o3 - 8]
        std     %g2, [%o3 - 16]
        ld      [%o1 - 20], %o5
        ld      [%o1 - 24], %o4
        ld      [%o1 - 28], %g3
        ld      [%o1 - 32], %g2
        std     %o4, [%o3 - 24]
        std     %g2, [%o3 - 32]
        ld      [%o1 - 36], %o5
        ld      [%o1 - 40], %o4
        ld      [%o1 - 44], %g3
        ld      [%o1 - 48], %g2
        std     %o4, [%o3 - 40]
        std     %g2, [%o3 - 48]
        ld      [%o1 - 52], %o5
        ld      [%o1 - 56], %o4
        ld      [%o1 - 60], %g3
        ld      [%o1 - 64], %g2
        std     %o4, [%o3 - 56]
        std     %g2, [%o3 - 64]
        deccc   64, %o2                 ! Room for another 64 bytes?
        dec     64, %o1
        bpos    bwd_word_chunk
        dec     64, %o3                 ! Delay slot.

        ba      bwd_small_word_copy
        inccc   64 - 8, %o2             ! See if can copy an 8 byte chunk (delay slot).

bwd_doubleword_chunk:
        ldd     [%o1 - 8], %o4          ! Copy a 64 byte chunk.
        ldd     [%o1 - 16], %g2
        std     %o4, [%o3 - 8]
        std     %g2, [%o3 - 16]
        ldd     [%o1 - 24], %o4
        ldd     [%o1 - 32], %g2
        std     %o4, [%o3 - 24]
        std     %g2, [%o3 - 32]
        ldd     [%o1 - 40], %o4
        ldd     [%o1 - 48], %g2
        std     %o4, [%o3 - 40]
        std     %g2, [%o3 - 48]
        ldd     [%o1 - 56], %o4
        ldd     [%o1 - 64], %g2
        std     %o4, [%o3 - 56]
        std     %g2, [%o3 - 64]
        deccc   64, %o2                 ! Room for another 64 bytes?
        dec     64, %o1
        bpos    bwd_doubleword_chunk
        dec     64, %o3                 ! Delay slot.

        inccc   64 - 8, %o2             ! See if can copy an 8 byte chunk.
bwd_small_word_copy:
        bneg,a  bwd_last_copy_tested
        inccc   8, %o2                  ! See if can copy next byte (delay slot).
bwd_small_word_chunk:
        ld      [%o1 - 4], %o5          ! Copy an 8 byte chunk.
        ld      [%o1 - 8], %o4
        deccc   8, %o2                  ! Room for another 8 bytes after this?
        std     %o4, [%o3 - 8]
        dec     8, %o1
        bpos    bwd_small_word_chunk
        dec     8, %o3                  ! Delay slot.
        inc     8, %o2                  ! Undo the bias: %o2 = bytes left.

bwd_last_copy:
        tst     %o2                     ! See if can copy next byte.
bwd_last_copy_tested:
        ble     copy_done
bwd_last_chunk:
        deccc   2, %o2                  ! Delay slot of the ble above, and
                                        ! first instruction of the loop.
        ldub    [%o1 - 1], %o4          ! Copy byte.
        bneg    copy_done               ! Only one byte was left.
        stb     %o4, [%o3 - 1]          ! Delay slot.
        ldub    [%o1 - 2], %o4          ! Copy byte.
        dec     2, %o1
        stb     %o4, [%o3 - 2]
        bne,a   bwd_last_chunk          ! Still testing the deccc result.
        dec     2, %o3                  ! Delay slot (only when looping).

        retl                            ! Return value will still be in %o0.
        nop

bwd_small_byte_copy:
        deccc   8, %o2                  ! See if can copy an 8 byte chunk.
        bneg,a  bwd_last_copy_tested
        inccc   8, %o2                  ! See if can copy next byte (delay slot).
bwd_small_byte_chunk:
        ld      [%o1 - 4], %o4          ! Copy an 8 byte chunk.
        ld      [%o1 - 8], %o5
        srl     %o4, 8, %g2
        stb     %o4, [%o3 - 1]
        stb     %g2, [%o3 - 2]
        srl     %o4, 16, %g2
        srl     %o4, 24, %g3
        stb     %g2, [%o3 - 3]
        stb     %g3, [%o3 - 4]
        srl     %o5, 8, %g2
        stb     %o5, [%o3 - 5]
        stb     %g2, [%o3 - 6]
        srl     %o5, 16, %g2
        srl     %o5, 24, %g3
        stb     %g2, [%o3 - 7]
        stb     %g3, [%o3 - 8]
        dec     8, %o1
        ba      bwd_small_byte_copy
        dec     8, %o3                  ! Delay slot.

bwd_byte_1_copy:
        ld      [%o1 - 4], %o5          ! Load first 4 bytes.
        srl     %o5, 8, %g1             ! Last 3 bytes in low part of %g1.
        stb     %o5, [%o3 - 1]          ! Store first byte.
        dec     4, %o1
        dec     4, %o3
bwd_byte_1_chunk:
        ld      [%o1 - 4], %o5          ! Copy a 32 byte chunk.
        ld      [%o1 - 8], %o4
        deccc   32, %o2                 ! Tested by the bpos at the bottom.
        sll     %o5, 24, %g3            ! First byte in %g3.
        or      %g3, %g1, %g3           ! Add 3 low bytes in %g1 from previous cycle.
        srl     %o5, 8, %o5             ! Next 3 bytes in low part of %o5.
        sll     %o4, 24, %g2            ! Next byte in high part of %g2.
        or      %o5, %g2, %g2           ! Add low 3 bytes from %o5.
        srl     %o4, 8, %g1             ! Save last 3 bytes in %g1 for next cycle.
        ld      [%o1 - 12], %o5
        ld      [%o1 - 16], %o4
        std     %g2, [%o3 - 8 + 3]      ! Stores lag the cursor by 3 bytes.
        sll     %o5, 24, %g3
        or      %g3, %g1, %g3
        srl     %o5, 8, %o5
        sll     %o4, 24, %g2
        or      %o5, %g2, %g2
        srl     %o4, 8, %g1
        ld      [%o1 - 20], %o5
        ld      [%o1 - 24], %o4
        std     %g2, [%o3 - 16 + 3]
        sll     %o5, 24, %g3
        or      %g3, %g1, %g3
        srl     %o5, 8, %o5
        sll     %o4, 24, %g2
        or      %o5, %g2, %g2
        srl     %o4, 8, %g1
        ld      [%o1 - 28], %o5
        ld      [%o1 - 32], %o4
        std     %g2, [%o3 - 24 + 3]
        sll     %o5, 24, %g3
        or      %g3, %g1, %g3
        srl     %o5, 8, %o5
        sll     %o4, 24, %g2
        or      %o5, %g2, %g2
        srl     %o4, 8, %g1
        std     %g2, [%o3 - 32 + 3]
        dec     32, %o1
        bpos    bwd_byte_1_chunk
        dec     32, %o3                 ! Delay slot.

        inc     32, %o2                 ! Undo the bias: %o2 = bytes left.
        srl     %g1, 16, %g2
        sth     %g1, [%o3 + 1]          ! Store last 3 bytes.
        ba      bwd_small_byte_copy
        stb     %g2, [%o3 + 0]          ! Delay slot.

bwd_byte_2_copy:
        ld      [%o1 - 4], %o4          ! Load first 4 bytes.
        srl     %o4, 16, %g1            ! Second 2 bytes in low half of %g1.
        sth     %o4, [%o3 - 2]          ! Store first 2 bytes.
        dec     4, %o1
        dec     4, %o3
bwd_byte_2_chunk:
        ld      [%o1 - 4], %o5          ! Copy a 32 byte chunk.
        ld      [%o1 - 8], %o4
        deccc   32, %o2                 ! Tested by the bpos at the bottom.
        sll     %o5, 16, %g3            ! First 2 bytes in %g3.
        or      %g3, %g1, %g3           ! Add low 2 bytes in %g1 from previous cycle.
        srl     %o5, 16, %o5            ! Second 2 bytes in low part of %o5.
        sll     %o4, 16, %g2            ! Third 2 bytes in high part of %g2.
        or      %o5, %g2, %g2           ! Add second 2 bytes from %o5.
        srl     %o4, 16, %g1            ! Save last 2 bytes in %g1 for next cycle.
        ld      [%o1 - 12], %o5
        ld      [%o1 - 16], %o4
        std     %g2, [%o3 - 8 + 2]      ! Stores lag the cursor by 2 bytes.
        sll     %o5, 16, %g3
        or      %g3, %g1, %g3
        srl     %o5, 16, %o5
        sll     %o4, 16, %g2
        or      %o5, %g2, %g2
        srl     %o4, 16, %g1
        ld      [%o1 - 20], %o5
        ld      [%o1 - 24], %o4
        std     %g2, [%o3 - 16 + 2]
        sll     %o5, 16, %g3
        or      %g3, %g1, %g3
        srl     %o5, 16, %o5
        sll     %o4, 16, %g2
        or      %o5, %g2, %g2
        srl     %o4, 16, %g1
        ld      [%o1 - 28], %o5
        ld      [%o1 - 32], %o4
        std     %g2, [%o3 - 24 + 2]
        sll     %o5, 16, %g3
        or      %g3, %g1, %g3
        srl     %o5, 16, %o5
        sll     %o4, 16, %g2
        or      %o5, %g2, %g2
        srl     %o4, 16, %g1
        std     %g2, [%o3 - 32 + 2]
        dec     32, %o1
        bpos    bwd_byte_2_chunk
        dec     32, %o3                 ! Delay slot.

        inc     32, %o2                 ! Undo the bias: %o2 = bytes left.
        ba      bwd_small_byte_copy
        sth     %g1, [%o3 + 0]          ! Store last 2 bytes (delay slot).

bwd_byte_3_copy:
        ld      [%o1 - 4], %o4          ! Load first 4 bytes.
        srl     %o4, 8, %g3
        srl     %o4, 24, %g1            ! Last byte in low part of %g1.
        stb     %o4, [%o3 - 1]          ! Store first 3 bytes.
        sth     %g3, [%o3 - 3]
        dec     4, %o1
        dec     4, %o3
bwd_byte_3_chunk:
        ld      [%o1 - 4], %o5          ! Copy a 32 byte chunk.
        ld      [%o1 - 8], %o4
        deccc   32, %o2                 ! Tested by the bpos at the bottom.
        sll     %o5, 8, %g3             ! First 3 bytes in %g3.
        or      %g3, %g1, %g3           ! Add low byte in %g1 from previous cycle.
        srl     %o5, 24, %o5            ! Fourth byte in low part of %o5.
        sll     %o4, 8, %g2             ! Next 3 bytes in high part of %g2.
        or      %o5, %g2, %g2           ! Add low byte from %o5.
        srl     %o4, 24, %g1            ! Save last byte in %g1 for next cycle.
        ld      [%o1 - 12], %o5
        ld      [%o1 - 16], %o4
        std     %g2, [%o3 - 8 + 1]      ! Stores lag the cursor by 1 byte.
        sll     %o5, 8, %g3
        or      %g3, %g1, %g3
        srl     %o5, 24, %o5
        sll     %o4, 8, %g2
        or      %o5, %g2, %g2
        srl     %o4, 24, %g1
        ld      [%o1 - 20], %o5
        ld      [%o1 - 24], %o4
        std     %g2, [%o3 - 16 + 1]
        sll     %o5, 8, %g3
        or      %g3, %g1, %g3
        srl     %o5, 24, %o5
        sll     %o4, 8, %g2
        or      %o5, %g2, %g2
        srl     %o4, 24, %g1
        ld      [%o1 - 28], %o5
        ld      [%o1 - 32], %o4
        std     %g2, [%o3 - 24 + 1]
        sll     %o5, 8, %g3
        or      %g3, %g1, %g3
        srl     %o5, 24, %o5
        sll     %o4, 8, %g2
        or      %o5, %g2, %g2
        srl     %o4, 24, %g1
        std     %g2, [%o3 - 32 + 1]
        dec     32, %o1
        bpos    bwd_byte_3_chunk
        dec     32, %o3                 ! Delay slot.

        inc     32, %o2                 ! Undo the bias: %o2 = bytes left.
        ba      bwd_small_byte_copy
        stb     %g1, [%o3 + 0]          ! Store last byte (delay slot).

copy_done:
        retl                            ! Return value will still be in %o0.
        nop