From 20439bb3975c34a24c3c337a1f231fbf973a41e8 Mon Sep 17 00:00:00 2001 From: Pavel Kirienko Date: Sun, 11 Jan 2015 04:35:03 +0300 Subject: [PATCH] Experimental optimization of the bit copy algorithm with special cases --- .../include/uavcan/marshal/bit_stream.hpp | 28 ++++ libuavcan/src/marshal/uc_bit_array_copy.cpp | 158 +++++++++++++++++- 2 files changed, 181 insertions(+), 5 deletions(-) diff --git a/libuavcan/include/uavcan/marshal/bit_stream.hpp b/libuavcan/include/uavcan/marshal/bit_stream.hpp index 35586390f5..413893e627 100644 --- a/libuavcan/include/uavcan/marshal/bit_stream.hpp +++ b/libuavcan/include/uavcan/marshal/bit_stream.hpp @@ -12,6 +12,8 @@ namespace uavcan { + +#if UAVCAN_TINY /** * This function implements fast copy of unaligned bit arrays. It isn't part of the library API, so it is not exported. * @param src_org Source array @@ -22,6 +24,16 @@ namespace uavcan */ void bitarrayCopy(const unsigned char* src_org, unsigned src_offset, unsigned src_len, unsigned char* dst_org, unsigned dst_offset); +#else +/** + * Special cases of @ref bitarrayCopy() - either source or destination must be aligned. + * These functions aren't part of the library API, so they are not exported. + */ +void bitarrayCopyAlignedToUnaligned(const unsigned char* src_org, unsigned src_len, + unsigned char* dst_org, unsigned dst_offset); +void bitarrayCopyUnalignedToAligned(const unsigned char* src_org, unsigned src_offset, unsigned src_len, + unsigned char* dst_org); +#endif /** * This class treats a chunk of memory as an array of bits. 
@@ -37,6 +49,7 @@ class UAVCAN_EXPORT BitStream static inline unsigned bitlenToBytelen(unsigned bits) { return (bits + 7) / 8; } +#if UAVCAN_TINY static inline void copyBitArrayAlignedToUnaligned(const uint8_t* src_org, unsigned src_len, uint8_t* dst_org, unsigned dst_offset) { @@ -50,6 +63,21 @@ class UAVCAN_EXPORT BitStream bitarrayCopy(reinterpret_cast(src_org), src_offset, src_len, reinterpret_cast(dst_org), 0); } +#else + static inline void copyBitArrayAlignedToUnaligned(const uint8_t* src_org, unsigned src_len, + uint8_t* dst_org, unsigned dst_offset) + { + bitarrayCopyAlignedToUnaligned(reinterpret_cast(src_org), src_len, + reinterpret_cast(dst_org), dst_offset); + } + + static inline void copyBitArrayUnalignedToAligned(const uint8_t* src_org, unsigned src_offset, unsigned src_len, + uint8_t* dst_org) + { + bitarrayCopyUnalignedToAligned(reinterpret_cast(src_org), src_offset, src_len, + reinterpret_cast(dst_org)); + } +#endif public: static const unsigned MaxBitsPerRW = MaxBytesPerRW * 8; diff --git a/libuavcan/src/marshal/uc_bit_array_copy.cpp b/libuavcan/src/marshal/uc_bit_array_copy.cpp index cdfd12c119..ae516b1f4c 100644 --- a/libuavcan/src/marshal/uc_bit_array_copy.cpp +++ b/libuavcan/src/marshal/uc_bit_array_copy.cpp @@ -8,6 +8,13 @@ #include #include #include +namespace uavcan +{ + +static const unsigned char reverse_mask[] = { 0x55U, 0x80U, 0xC0U, 0xE0U, 0xF0U, 0xF8U, 0xFCU, 0xFEU, 0xFFU }; +static const unsigned char reverse_mask_xor[] = { 0xFFU, 0x7FU, 0x3FU, 0x1FU, 0x0FU, 0x07U, 0x03U, 0x01U, 0x00U }; + +#if UAVCAN_TINY #define PREPARE_FIRST_COPY() \ do { \ @@ -21,15 +28,10 @@ src_len = 0; \ } } while (0) -namespace uavcan -{ void bitarrayCopy(const unsigned char* src_org, unsigned src_offset, unsigned src_len, unsigned char* dst_org, unsigned dst_offset) { - static const unsigned char reverse_mask[] = { 0x55U, 0x80U, 0xC0U, 0xE0U, 0xF0U, 0xF8U, 0xFCU, 0xFEU, 0xFFU }; - static const unsigned char reverse_mask_xor[] = { 0xFFU, 0x7FU, 0x3FU, 0x1FU, 
#if !UAVCAN_TINY
namespace uavcan
{
/**
 * Copies @p src_len bits from a byte-aligned source into a destination that may start at any bit offset.
 * Bits are numbered MSB-first within each byte (bit 0 is the most significant bit of byte 0).
 * Destination bits outside of [dst_offset, dst_offset + src_len) are left untouched.
 * The implementation assumes CHAR_BIT == 8 (the masks are written for 8-bit bytes).
 *
 * @param src_org       Source array; reading starts at its first bit
 * @param src_len       Number of bits to copy; zero makes the call a no-op
 * @param dst_org       Destination array
 * @param dst_offset    Offset in bits from the beginning of the destination array
 */
void bitarrayCopyAlignedToUnaligned(const unsigned char* src_org, unsigned src_len,
                                    unsigned char* dst_org, unsigned dst_offset)
{
    if (src_len == 0U)
    {
        return;
    }

    const unsigned char* src = src_org;
    unsigned char* dst = dst_org + (dst_offset / CHAR_BIT);
    const unsigned dst_offset_modulo = dst_offset % CHAR_BIT;

    if (dst_offset_modulo == 0U)
    {
        // Both sides are byte-aligned: bulk copy the whole bytes, then merge the partial tail byte.
        const unsigned byte_len = src_len / CHAR_BIT;
        const unsigned tail_bits = src_len % CHAR_BIT;

        if (byte_len > 0U)
        {
            (void)std::memcpy(dst, src, byte_len);
            src += byte_len;
            dst += byte_len;
        }
        if (tail_bits > 0U)
        {
            const unsigned keep_low = 0xFFU >> tail_bits;           // Destination bits that must survive
            *dst = static_cast<unsigned char>((*dst & keep_low) | (*src & (0xFFU & ~keep_low)));
        }
        return;
    }

    // Number of free bits in the first (partially occupied) destination byte.
    const unsigned room = CHAR_BIT - dst_offset_modulo;
    // Mask of the upper dst_offset_modulo bits of the first destination byte, which are not ours to touch.
    const unsigned keep_high = (0xFF00U >> dst_offset_modulo) & 0xFFU;
    // Upper bits of the first source byte, moved down to the write position; the top bits are already zero.
    const unsigned head = static_cast<unsigned>(*src) >> dst_offset_modulo;

    if (src_len < room)
    {
        // The whole copy fits inside the first destination byte. Preserve everything outside of
        // [dst_offset_modulo, dst_offset_modulo + src_len) - exactly, without clearing the next bit.
        const unsigned keep = keep_high | (0xFFU >> (dst_offset_modulo + src_len));
        *dst = static_cast<unsigned char>((*dst & keep) | (head & (0xFFU & ~keep)));
        return;
    }

    *dst = static_cast<unsigned char>((*dst & keep_high) | head);
    ++dst;
    src_len -= room;

    // Every following destination byte is assembled from the low bits of one source byte
    // and the high bits of the next one.
    for (unsigned remaining_bytes = src_len / CHAR_BIT; remaining_bytes > 0U; --remaining_bytes)
    {
        const unsigned merged = (static_cast<unsigned>(src[0]) << room) |
                                (static_cast<unsigned>(src[1]) >> dst_offset_modulo);
        *dst++ = static_cast<unsigned char>(merged & 0xFFU);
        ++src;
    }

    const unsigned tail_bits = src_len % CHAR_BIT;
    if (tail_bits > 0U)
    {
        unsigned bits = (static_cast<unsigned>(src[0]) << room) & 0xFFU;
        // The current source byte still holds dst_offset_modulo unread bits. Touch the next source byte
        // only if the tail really needs it, otherwise we would read past the end of the source data.
        if (tail_bits > dst_offset_modulo)
        {
            bits |= static_cast<unsigned>(src[1]) >> dst_offset_modulo;
        }
        const unsigned keep_low = 0xFFU >> tail_bits;
        *dst = static_cast<unsigned char>((*dst & keep_low) | (bits & (0xFFU & ~keep_low)));
    }
}
}
#endif
#if !UAVCAN_TINY
namespace uavcan
{
/**
 * Copies @p src_len bits from a source that may start at any bit offset into a byte-aligned destination.
 * Bits are numbered MSB-first within each byte (bit 0 is the most significant bit of byte 0).
 * Every fully covered destination byte is overwritten; in the last, partially covered byte only the
 * upper (src_len % CHAR_BIT) bits are replaced and the remaining low bits are left untouched.
 * The implementation assumes CHAR_BIT == 8 (the masks are written for 8-bit bytes).
 *
 * @param src_org       Source array
 * @param src_offset    Offset in bits from the beginning of the source array
 * @param src_len       Number of bits to copy; zero makes the call a no-op
 * @param dst_org       Destination array; writing starts at its first bit
 */
void bitarrayCopyUnalignedToAligned(const unsigned char* src_org, unsigned src_offset, unsigned src_len,
                                    unsigned char* dst_org)
{
    if (src_len == 0U)
    {
        return;
    }

    const unsigned char* src = src_org + (src_offset / CHAR_BIT);
    unsigned char* dst = dst_org;
    const unsigned src_offset_modulo = src_offset % CHAR_BIT;

    const unsigned byte_len = src_len / CHAR_BIT;
    const unsigned tail_bits = src_len % CHAR_BIT;
    const unsigned keep_low = 0xFFU >> tail_bits;       // Low bits of the tail byte that must survive
    const unsigned take_high = 0xFFU & ~keep_low;       // Upper tail_bits bits, which come from the source

    if (src_offset_modulo == 0U)
    {
        // Both sides are byte-aligned: bulk copy the whole bytes, then merge the partial tail byte.
        if (byte_len > 0U)
        {
            (void)std::memcpy(dst, src, byte_len);
            src += byte_len;
            dst += byte_len;
        }
        if (tail_bits > 0U)
        {
            *dst = static_cast<unsigned char>((*dst & keep_low) | (*src & take_high));
        }
        return;
    }

    // Number of still unread bits in the current source byte.
    const unsigned avail = CHAR_BIT - src_offset_modulo;

    // Each whole destination byte is plainly assigned (no stale destination bits can leak through);
    // it is assembled from the low bits of one source byte and the high bits of the next one.
    for (unsigned remaining_bytes = byte_len; remaining_bytes > 0U; --remaining_bytes)
    {
        const unsigned merged = (static_cast<unsigned>(src[0]) << src_offset_modulo) |
                                (static_cast<unsigned>(src[1]) >> avail);
        *dst++ = static_cast<unsigned char>(merged & 0xFFU);
        ++src;
    }

    if (tail_bits > 0U)
    {
        unsigned bits = (static_cast<unsigned>(src[0]) << src_offset_modulo) & 0xFFU;
        // Touch the next source byte only if the tail really spills into it,
        // otherwise we would read past the end of the source data.
        if (tail_bits > avail)
        {
            bits |= static_cast<unsigned>(src[1]) >> avail;
        }
        *dst = static_cast<unsigned char>((*dst & keep_low) | (bits & take_high));
    }
}
}
#endif