walnux/libs/libc/machine/arm/armv8-m/gnu/arch_strcpy.S
anjiahao 603c87977f libc/machine:Add prefixes to libc functions implemented by arch
kasan does not instrument the assembly function,
so it is checked in advance before calling.
If Kasan is not turned on, the speed and space will
be almost unchanged under compiler optimization.

Signed-off-by: anjiahao <anjiahao@xiaomi.com>
2024-10-31 18:11:16 +08:00

332 lines
8.7 KiB
ArmAsm

/***************************************************************************
* libs/libc/machine/arm/armv8-m/gnu/arch_strcpy.S
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
***************************************************************************/
#include "libc.h"
#ifdef LIBC_BUILD_STRCPY
/* Default the branch-protection feature macros to 0 when the compiler does
 * not provide them, so the function entry/exit code can test them with a
 * plain #if.
 */
#ifndef __ARM_FEATURE_BTI_DEFAULT
# define __ARM_FEATURE_BTI_DEFAULT 0
#endif
#ifndef __ARM_FEATURE_PAC_DEFAULT
# define __ARM_FEATURE_PAC_DEFAULT 0
#endif
/* This strcpy borrows some ideas from arch_strcmp.S. */
/* Parameters and result.
 * dst and result both name r0: the incoming dst value is pushed on entry
 * and popped back into r0 on exit, while r0 itself is advanced as the
 * write pointer in between.
 */
#define dst r0
#define src r1
#define result r0
/* Internal variables, or callee saved registers.
 * r4-r7 are pushed on entry and popped on exit.
 * src_offset holds the misalignment of src relative to a word boundary,
 * first in bytes and then, after being shifted left by 3, in bits.
 */
#define tmp1 r4
#define tmp2 r5
#define tmp3 r6
#define src_offset r7
/* MASK_n selects, and BYTE_n_SHIFT moves down to bits 7:0, the byte that
 * sits at memory offset n inside a word loaded with ldr.  The values depend
 * on endianness: on big-endian the byte at the lowest address is the most
 * significant one, on little-endian it is the least significant one.
 * They are used by the byte-wise tail copy.
 */
#ifdef __ARM_BIG_ENDIAN
# define MASK_0 0xff000000
# define MASK_1 0xff0000
# define MASK_2 0xff00
# define MASK_3 0xff
# define BYTE_0_SHIFT 24
# define BYTE_1_SHIFT 16
# define BYTE_2_SHIFT 8
# define BYTE_3_SHIFT 0
#else
# define MASK_0 0xff
# define MASK_1 0xff00
# define MASK_2 0xff0000
# define MASK_3 0xff000000
# define BYTE_0_SHIFT 0
# define BYTE_1_SHIFT 8
# define BYTE_2_SHIFT 16
# define BYTE_3_SHIFT 24
#endif
/***************************************************************************
 * Name: ARCH_LIBCFUN(strcpy)
 *
 * Description:
 *   Copy the NUL-terminated string at src, terminator included, to dst.
 *
 * Input Parameters:
 *   r0 (dst) - destination buffer
 *   r1 (src) - source string
 *
 * Returned Value:
 *   r0 - the dst pointer that was passed in (saved on entry, restored on
 *        exit)
 *
 * Registers:
 *   r4-r7 are saved on the stack and restored before returning.
 *   r1 and the condition flags are modified.  ip is only written by the
 *   pac/pacbti instruction and must stay untouched until aut.
 *
 * Notes:
 *   The word loops load whole aligned words from the source.  Bytes that
 *   follow the terminator inside the same aligned word are therefore read
 *   (but never stored).  In the mismatched-offset path the aligned word
 *   that contains the current src byte is loaded, so up to three bytes
 *   preceding it inside that word are read as well.
 *
 *   Three strategies are used:
 *     1. dst and src share the same offset from a word boundary:
 *        byte copy until aligned, then word copy.
 *     2. Offsets differ: byte copy until dst is aligned, then build each
 *        destination word from two consecutive aligned source words.
 *     3. As soon as a loaded word contains a zero byte: byte copy the
 *        remainder and store the terminator.
 *
 ***************************************************************************/

        .syntax unified
        .text
        .align  2
        .global ARCH_LIBCFUN(strcpy)
        .thumb
        .type   ARCH_LIBCFUN(strcpy), %function
ARCH_LIBCFUN(strcpy):

        /* Branch-protection entry sequence.  With return-address signing
         * enabled, sign lr into ip (pacbti doubles as the BTI landing pad).
         * With BTI alone, a landing pad is still required so that the
         * function can be the target of an indirect call.
         */

#if __ARM_FEATURE_PAC_DEFAULT
#  if __ARM_FEATURE_BTI_DEFAULT
        pacbti  ip, lr, sp
#  else
        pac     ip, lr, sp
#  endif /* __ARM_FEATURE_BTI_DEFAULT */
#elif __ARM_FEATURE_BTI_DEFAULT
        bti
#endif /* __ARM_FEATURE_PAC_DEFAULT */

        /* Save the working registers together with the incoming dst; the
         * latter is popped back into r0 as the return value.
         */

        push    {result, tmp1, tmp2, tmp3, src_offset}

        /* (dst ^ src) & 3 is non-zero when the two pointers sit at
         * different byte offsets from a word boundary.
         */

        eor     tmp1, dst, src
        tst     tmp1, #3
        bne     .Lstrs_diff_offset

        /* Same byte offset: tmp1 = src & 3, zero means already aligned */

        ands    tmp1, src, #3
        beq     .Ldst_src_aligned

        /* tmp1 = 4 - (src & 3) = number of bytes up to the word boundary */

        rsb     tmp1, #4

.Lbyte_copy_until_dst_src_aligned:
        ldrb    tmp2, [src], #1
        cmp     tmp2, #0
        beq     .Lcopy_done
        strb    tmp2, [dst], #1
        subs    tmp1, #1
        bne     .Lbyte_copy_until_dst_src_aligned

.Ldst_src_aligned:

        /* dst and src are both word aligned here.
         * (w - 0x01010101) & ~w & 0x80808080 is non-zero exactly when the
         * word w contains a zero byte.
         */

        ldr     tmp1, [src], #4
        sub     tmp2, tmp1, #0x01010101
        bic     tmp2, tmp1
        tst     tmp2, #0x80808080

        /* Z set means no zero byte: store the whole word and continue */

        it      eq
        streq   tmp1, [dst], #4
        beq     .Ldst_src_aligned

        /* The word holds the terminator: step src back to the start of
         * that word and finish byte by byte.
         */

        sub     src, #4

.Lbyte_copy_until_zero:
        ldrb    tmp2, [src], #1
        cmp     tmp2, #0
        beq     .Lcopy_done
        strb    tmp2, [dst], #1
        b       .Lbyte_copy_until_zero

        /* Make dst aligned, so we won't write anything before dst.
         * If we attempt to write before dst, atomic read-write must
         * be ensured. Atomic operation complicates things.
         * So the solution here is byte by byte copy until dst aligned.
         */

.Lstrs_diff_offset:
        ands    tmp1, dst, #3
        beq     .Ldiff_offset_loop_begin

        /* tmp1 = 4 - (dst & 3) = number of bytes until dst is aligned */

        rsb     tmp1, #4

.Lbyte_copy_until_dst_aligned:
        ldrb    tmp2, [src], #1
        cmp     tmp2, #0
        beq     .Lcopy_done
        strb    tmp2, [dst], #1
        subs    tmp1, #1
        bne     .Lbyte_copy_until_dst_aligned

.Ldiff_offset_loop_begin:

        /* dst is aligned and the offsets differ, so src_offset mustn't be
         * 0 here.  Convert it from a byte count to a bit count (8, 16 or
         * 24) and round src down to the enclosing word.
         */

        and     src_offset, src, 3
        lsls    src_offset, #3
        bic     src, #3

        /* first word logic
         * prepend 0xff to make the algorithm simpler
         * only the first word needs to be prepended
         *
         * The bytes of the first word that lie before the original src are
         * forced to 0xff so that they cannot be mistaken for a terminator.
         */

        ldr     tmp1, [src], #4
        mov     tmp2, #0xffffffff
        rsb     tmp3, src_offset, #32
#ifdef __ARM_BIG_ENDIAN
        lsls    tmp2, tmp3
#else
        lsrs    tmp2, tmp3
#endif
        orr     tmp1, tmp1, tmp2

        /* Test if the first word contains zero */

        sub     tmp3, tmp1, #0x01010101
        bic     tmp3, tmp1
        tst     tmp3, #0x80808080

        /* non-zero means zero byte is detected.  In that case the
         * terminator is in tmp1, so the tail copy finishes before it ever
         * looks at tmp2 (which still holds the 0xff mask).
         */

        bne     .Ltail_copy

        /* before loop, set tmp2=tmp1 to simplify the logic in the loop */

        mov     tmp2, tmp1

.Ldiff_offset_loop:

        /* tmp1 = previous source word (its bytes from src_offset on are
         * not stored yet), tmp2 = next source word.
         */

        mov     tmp1, tmp2
        ldr     tmp2, [src], #4

        /* Test if contains zero */

        sub     tmp3, tmp2, #0x01010101
        bic     tmp3, tmp2
        tst     tmp3, #0x80808080

        /* non-zero means zero byte is detected */

        bne     .Ltail_copy

        /* Now let's fill dst: combine the pending bytes of tmp1 with the
         * leading bytes of tmp2 into one destination word.
         */

#ifdef __ARM_BIG_ENDIAN
        lsls    tmp1, src_offset
        rsb     tmp3, src_offset, #32
        lsrs    tmp3, tmp2, tmp3
        orr     tmp1, tmp1, tmp3
#else
        lsrs    tmp1, src_offset
        rsb     tmp3, src_offset, #32
        lsls    tmp3, tmp2, tmp3
        orr     tmp1, tmp1, tmp3
#endif
        str     tmp1, [dst], #4
        b       .Ldiff_offset_loop

        /* Byte by byte copy at the tail.  Enter the chain at the first
         * pending byte of tmp1 (selected by src_offset), then continue with
         * bytes 0..2 of tmp2, stopping at the first zero byte.
         */

.Ltail_copy:
        cmp     src_offset, #24
        beq     .Loffset_3
        cmp     src_offset, #16
        beq     .Loffset_2

        /* src_offset == 8 here */

        ands    tmp3, tmp1, MASK_1
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_1_SHIFT
        strb    tmp3, [dst], #1

.Loffset_2:
        ands    tmp3, tmp1, MASK_2
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_2_SHIFT
        strb    tmp3, [dst], #1

.Loffset_3:
        ands    tmp3, tmp1, MASK_3
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_3_SHIFT
        strb    tmp3, [dst], #1

        ands    tmp3, tmp2, MASK_0
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_0_SHIFT
        strb    tmp3, [dst], #1

        ands    tmp3, tmp2, MASK_1
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_1_SHIFT
        strb    tmp3, [dst], #1

        ands    tmp3, tmp2, MASK_2
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_2_SHIFT
        strb    tmp3, [dst], #1

        /* A zero byte is known to exist; if none of the bytes above was
         * it, byte 3 of tmp2 must be zero, so fall through.
         */

.Lcopy_done:

        /* Store the terminator, then restore the saved registers; r0 gets
         * the original dst back.
         */

        mov     tmp3, #0
        strb    tmp3, [dst]
        pop     {result, tmp1, tmp2, tmp3, src_offset}
#if __ARM_FEATURE_PAC_DEFAULT
        aut     ip, lr, sp
#endif /* __ARM_FEATURE_PAC_DEFAULT */
        bx      lr
        .size   ARCH_LIBCFUN(strcpy), . - ARCH_LIBCFUN(strcpy)
#if 0
/* Pseudo Code of strcpy when dst/src not at same byte offset */
/* Make dst aligned, so we won't write anything before dst.
* If we attempt to write before dst, atomic read-write must
* be ensured. Atomic operation complicates things.
* So the solution here is byte by byte copy until dst aligned.
*/
if ((dst & 3) == 0)
goto diff_offset_loop_begin;
ByteCopyUntilDstAligned();
.diff_offset_loop_begin:
/* src_offset mustn't be 0 here */
src_offset = src & 3;
src_offset = src_offset * 8;
src = src & 0xfffffffc;
tmp1 = *src;
src +=4;
/* first word logic
* prepend 0xff to make the algorithm simpler
* only the first word needs to be prepended
*/
if (src_offset != 0)
{
tmp2 = 0xffffffff
#if big endian
tmp2 = tmp2 << (32 - src_offset)
#else
tmp2 = tmp2 >> (32 - src_offset)
#endif
tmp1 |= tmp2
}
if (HasZeroByte(tmp1))
{
goto .tail_copy;
}
/* before loop, set tmp2=tmp1 to simplify the logic in the loop */
tmp2 = tmp1
.diff_offset_loop:
tmp1 = tmp2;
tmp2 = *src;
src += 4;
/* double word tail means we have to copy from tmp1 and tmp2 to dst */
if (HasZeroByte(tmp2))
{
goto .tail_copy;
}
/* Now let's fill dst */
#if big endian
tmp1 = tmp1 << (src_offset);
tmp1 |= tmp2 >> (32 - src_offset);
*dst = tmp1;
#else
tmp1 = tmp1 >> (src_offset);
tmp1 |= tmp2 << (32 - src_offset);
*dst = tmp1;
#endif
dst +=4;
goto .diff_offset_loop;
/* byte by byte copy at the tail */
.tail_copy:
if (src_offset == 24)
goto offset_3;
if (src_offset == 16)
goto offset_2;
/* src_offset mustn't be 0 here */
/* default src_offset == 8 (src_offset is a bit count at this point) */
if ((tmp1 & MASK_1) == 0)
goto cpy_done;
*dst++ = (tmp1 & MASK_1) >> BYTE_1_SHIFT;
offset_2:
if ((tmp1 & MASK_2) == 0)
goto cpy_done;
*dst++ = (tmp1 & MASK_2) >> BYTE_2_SHIFT;
offset_3:
if ((tmp1 & MASK_3) == 0)
goto cpy_done;
*dst++ = (tmp1 & MASK_3) >> BYTE_3_SHIFT;
if ((tmp2 & MASK_0) == 0)
goto cpy_done;
*dst++ = (tmp2 & MASK_0) >> BYTE_0_SHIFT;
if ((tmp2 & MASK_1) == 0)
goto cpy_done;
*dst++ = (tmp2 & MASK_1) >> BYTE_1_SHIFT;
if ((tmp2 & MASK_2) == 0)
goto cpy_done;
*dst++ = (tmp2 & MASK_2) >> BYTE_2_SHIFT;
/* tmp2 BYTE3 must be zero here */
.cpy_done:
*dst++ = 0;
#endif /* Pseudo code end */
#endif