编程与调试 C++ -- C++11 模板元编程 实现编译期字符串加密

反编译一个程序,我们往往通过 IDA 或者 windbg 从字符串入手。 如果我们能针对每个字符串加密,运行时解密,就能更好的把字符串信息进行隐藏。 而 C++11/C++14 的新特性 模板元编程 能实现编译期字符串加密。 并且还很好的保证了代码的可读性。

目标实现宏 OBFUSCATED,编译出来的二进制文件,IDA 逆向分析不包含字符串 Baby Hai's Secret

std::string stdstr = OBFUSCATED("Baby Hai's Secret");

模板元编程

这玩意号称图灵完备的,很厉害,能实现编译期间运算,主要用到 模板偏特化 & 编译优化。

  • 第一个版本,实现长度为 6 的字符串的编译时加密。
  • 第二个版本,实现任意长度的字符串 编译时加密。
  • 第三个版本,实现任意长度的字符串 随机 key 的编译时加密。
  • 第四个版本,三个算法,随机选择加密。

算阶乘

阶乘定义:

N! = 1 * 2 * 3 * 4 * ... * N
N! = N * (N - 1)!
#include <iostream>
#include <type_traits>

// Compile-time factorial: Factorial<N>::value == N!.
// The recursion N! = N * (N - 1)! is unrolled through template instantiation
// and stopped by the Factorial<0> specialization below.
template<int N>
struct Factorial
{
    static_assert(N >= 0, "Factorial<N> requires a non-negative N");

    // The recursive argument is clamped at 0 so that a negative N fails on the
    // static_assert above instead of recursing until the compiler's
    // template-instantiation depth limit is reached.
    static const int value = N * Factorial<(N > 0 ? N - 1 : 0)>::value;
};

// Base case: 0! = 1.
template<>
struct Factorial<0>
{
    static const int value = 1;
};

int main() {
    // Factorial<5>::value is already a constant when the program is compiled.
    constexpr int result = Factorial<5>::value;
    std::cout << "Factorial(5) = " << result << std::endl;
    return 0;
}

生成的汇编代码:

答案编译期就计算出来了,5! = 0x78 = 120

enable_if

C++11 std::enable_if(C++14 增加了别名 std::enable_if_t)。 编译器在类型推导的过程中,会尝试推导所有的重载函数,在此过程中,如果 enable_if 条件不满足,则会在候选函数集合中剔除此函数(即 SFINAE)。

#include <iostream>
#include <type_traits>

// 1. Constraint expressed in the return type: the function only exists when
//    T is an integral type.
template <class T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
is_odd(T i)
{
    const auto remainder = i % 2;
    return remainder != 0;
}

// 2. Constraint expressed in a defaulted template parameter: the function
//    only exists when T is an integral type.
template <class T,
          class = typename std::enable_if<std::is_integral<T>::value>::type>
bool is_even(T i)
{
    return (i % 2) == 0;
}

int main() {
    short int i = 1;  // 如果不是整数类型,编译不过。
    //float i = 1; // 模板推导失败
    std::cout << "i is odd: " << is_odd(i) << std::endl;
    std::cout << "i is even: " << is_even(i) << std::endl;
    return 0;
}

第一个版本

#include <iostream>
#include <type_traits>

// Obfuscated string of fixed length: stores the characters str[I]... XOR-ed
// with 0x55 and decrypts them in place on request.
template<int... I> // indices into the string literal
struct MetaString1
{
    // Meant to be evaluated at compile time: one encrypted character per index.
    // The extra trailing element of m_buffer is left zero-initialized.
    constexpr __forceinline MetaString1(const char* str)
        : m_buffer{ encrypt(str[I])... } { }

    // Run-time decryption. XORs the buffer in place, writes the terminator and
    // returns a pointer into this object, so the pointer is only usable while
    // the object is alive. Calling it a second time XORs the data again.
    inline const char* decrypt()
    {
        for (size_t i = 0; i < sizeof...(I); ++i) {
            m_buffer[i] = decrypt(m_buffer[i]);
        }
        m_buffer[sizeof...(I)] = 0;
        return m_buffer;
    }

private:
    // Encrypts a single character.
    constexpr char encrypt(char c) const { return c ^ 0x55; }
    // Decrypts a single character (XOR is its own inverse).
    constexpr char decrypt(char c) const { return encrypt(c); }

private:
    // sizeof...(I) characters plus the null terminator.
    char m_buffer[sizeof...(I) + 1];
};

// Hard-coded to the indices 0..5, so only the first six characters of str are
// stored. The returned pointer refers to a temporary MetaString1 that is
// destroyed at the end of the enclosing full expression.
#define OBFUSCATED1(str) (MetaString1<0, 1, 2, 3, 4, 5>(str).decrypt())

int main() {
    // Each temporary created by OBFUSCATED1 lives until the end of its
    // statement, so streaming the pointer directly is safe here.
    // Literal longer than six characters: only indices 0..5 are stored.
    std::cout << OBFUSCATED1("123456789") << std::endl;
    std::cout << OBFUSCATED1("123456") << std::endl;
    // Shorter literal: index 5 reads the literal's own null terminator.
    std::cout << OBFUSCATED1("12345") << std::endl;
    return 0;
}

输出:

123456
123456
12345

很明显,这个版本的下标固定为 0~5,只能处理前 6 个字符:更长的字符串会被截断。

简化一下代码,编译 Release 看看汇编代码。

#include <iostream>
#include <type_traits>

// Same fixed-key (0x55) obfuscated string as before, trimmed down so the
// generated Release assembly is easier to read.
template<int... I> // indices into the string literal
struct MetaString1
{
    // Meant to be evaluated at compile time: one encrypted character per index.
    constexpr __forceinline MetaString1(const char* str)
        : m_buffer{ encrypt(str[I])... } {
    }

    // Run-time decryption: XORs the buffer in place, terminates it and returns
    // a pointer into this object.
    const char* decrypt()
    {
        for (size_t i = 0; i < sizeof...(I); ++i) {
            m_buffer[i] = decrypt(m_buffer[i]);
        }
        m_buffer[sizeof...(I)] = 0;
        return m_buffer;
    }

private:
    // Encrypts a single character; decrypting is the same XOR.
    constexpr char encrypt(char c) const { return c ^ 0x55; }
    constexpr char decrypt(char c) const { return encrypt(c); }

private:
    // sizeof...(I) characters plus the null terminator.
    char m_buffer[sizeof...(I) + 1];
};

int main() {
    auto str = MetaString1<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>("0123456789");
    std::cout << "test";
    std::cout << str.decrypt() << std::endl;
    return 0;
}

编译器设置:

编译后的汇编代码(都优化成了一堆 mov 指令):

Windows 是小端存储(低位在低地址):

>>> "%c" % chr(0x66 ^ 0x55) '3'
>>> "%c" % chr(0x67 ^ 0x55) '2'
>>> "%c" % chr(0x64 ^ 0x55) '1'
>>> "%c" % chr(0x65 ^ 0x55) '0'

第二个版本

自动识别字符串长度。 使用偏特化为每个长度的字符串自动生成一个模板。

// C++14 (C++1y) ships std::index_sequence for the same purpose.
// Make_Indexes<N>::type is Indexes<0, 1, 2, ..., N - 1>.
template<int... I>
struct Indexes
{
    // The same pack extended by one more index (its current length).
    using type = Indexes<I..., sizeof...(I)>;
};

template<int N>
struct Make_Indexes
{
private:
    using Shorter = typename Make_Indexes<N - 1>::type;   // Indexes<0, ..., N - 2>

public:
    using type = typename Shorter::type;                  // Indexes<0, ..., N - 1>
};

// Recursion anchor: the empty index list.
template<>
struct Make_Indexes<0>
{
    using type = Indexes<>;
};

完整版本:

#include <iostream>
#include <type_traits>

// C++14 (C++1y) ships std::index_sequence for the same purpose.
// Make_Indexes<N>::type is Indexes<0, 1, 2, ..., N - 1>.
template<int... I>
struct Indexes
{
    // The same pack extended by one more index (its current length).
    using type = Indexes<I..., sizeof...(I)>;
};

template<int N>
struct Make_Indexes
{
private:
    using Shorter = typename Make_Indexes<N - 1>::type;   // Indexes<0, ..., N - 2>

public:
    using type = typename Shorter::type;                  // Indexes<0, ..., N - 1>
};

// Recursion anchor: the empty index list.
template<>
struct Make_Indexes<0>
{
    using type = Indexes<>;
};

// Primary template: only the Indexes<I...> specialization below is defined.
template<typename Indexes>
struct MetaString2;

// Obfuscated string whose length is carried by the index list, so one
// specialization is generated per string length. The key is still the
// constant 0x55.
template<int... I>
struct MetaString2<Indexes<I...>>
{
    // Meant to be evaluated at compile time: one encrypted character per index.
    constexpr __forceinline MetaString2(const char* str)
        : m_buffer{ encrypt(str[I])... } { }

    // Run-time decryption: XORs the buffer in place, terminates it and returns
    // a pointer into this object.
    inline const char* decrypt()
    {
        for (size_t i = 0; i < sizeof...(I); ++i)
            m_buffer[i] = decrypt(m_buffer[i]);
        m_buffer[sizeof...(I)] = 0;
        return m_buffer;
    }

private:
    // XOR with the fixed key; encryption and decryption are the same operation.
    constexpr char encrypt(char c) const { return c ^ 0x55; }
    constexpr char decrypt(char c) const { return encrypt(c); }

private:
    // sizeof...(I) characters plus the null terminator.
    char m_buffer[sizeof...(I) + 1];
};

int main() {
#define cstr "123"
    auto str = MetaString2<Make_Indexes<sizeof(cstr) - 1>::type>(cstr);
    return 0;
}

第三个版本

上面的每次都是同一个密钥 0x55,能不能每次都采用不同的密钥呢。

用编译时 时间 __TIME__ 作为种子,生成编译期随机数。 宏 __COUNTER__ 是一个计数器,会从 0 开始计数,然后每次调用加 1。 __COUNTER__ 保证每个字符串 key 都不一样,__TIME__ 保证每次构建都不一样。 Predefined macros

完整的编译期随机数生成算法:

#include <iostream>
#include <type_traits>
#include <random>

namespace
{
    // Build time supplied by the preprocessor, 24-hour format "hh:mm:ss".
    constexpr char time[] = __TIME__;

    // Converts one ASCII decimal digit to its numeric value.
    constexpr int DigitToInt(char c) { return c - '0'; }

    // Seed of the compile-time random generator: seconds since midnight of the
    // build time, plus one. The offset keeps the seed non-zero: a build at
    // 00:00:00 would otherwise seed MetaRandomGenerator with 0, from which
    // every generated value (and therefore every key) is 0 as well.
    const int seed = 1 +
        DigitToInt(time[7]) + DigitToInt(time[6]) * 10 +          // seconds
        DigitToInt(time[4]) * 60 + DigitToInt(time[3]) * 600 +    // minutes
        DigitToInt(time[1]) * 3600 + DigitToInt(time[0]) * 36000; // hours
}

// Compile-time pseudo-random sequence: MetaRandomGenerator<N>::value is derived
// from MetaRandomGenerator<N - 1>::value, starting from `seed` at N == 0.
// Every step instantiates one more template, so the instantiation depth grows
// linearly with N.
// NOTE(review): the constants are those of the Park-Miller generator, but the
// textbook 16-bit split adds (hi >> 15) rather than hi; only unpredictability
// is needed here, so the arithmetic is left untouched — confirm if exact
// Park-Miller output is ever required.
template<int N>
struct MetaRandomGenerator
{
private:
    static constexpr unsigned a = 16807;        // 7^5
    static constexpr unsigned m = 2147483647;   // 2^31 - 1

    static constexpr unsigned s = MetaRandomGenerator<N - 1>::value;
    static constexpr unsigned lo = a * (s & 0xFFFF); // low 16 bits times 16807
    static constexpr unsigned hi = a * (s >> 16);    // high 16 bits times 16807
    static constexpr unsigned result = lo + hi + ((hi & 0x7FFF) << 16);

public:
    static constexpr unsigned max = m;
    // A single conditional subtraction keeps the value at or below m.
    static constexpr unsigned value = result > m ? result - m : result;
};

// Start of the sequence: the seed derived from the build time.
template<>
struct MetaRandomGenerator<0>
{
    static constexpr unsigned value = seed;
};

template<int N, int M>
struct MetaRandom
{
    static const int value = MetaRandomGenerator<N + 1>::value % M;
};

int main() {
    // The numbers change from build to build because the seed comes from __TIME__.
    // Each __COUNTER__ expansion yields the next integer, so v and x come from
    // different generator states.
    int v = MetaRandom<__COUNTER__, 10>::value;
    int x = MetaRandom<__COUNTER__, 10>::value;
    return 0;
}

完整代码:

// Primary template: only the Indexes<I...> specialization below is defined.
template<typename Indexes, int K>
struct MetaString3;

// Obfuscated string with a per-instance XOR key K. The key is stored in the
// first byte of the buffer, followed by the encrypted characters.
template<int... I, int K>
struct MetaString3<Indexes<I...>, K>
{
    // buffer[0] stores the key; it is initialized first, so encrypt() can
    // already read it while the remaining elements are being initialized.
    constexpr __forceinline MetaString3(const char* str)
        : m_buffer{ static_cast<char>(K), encrypt(str[I])... } { }

    // Run-time decryption: XORs the characters after the key byte in place,
    // terminates them and returns a pointer just past the key byte. The
    // pointer refers to storage inside this object.
    inline const char* decrypt()
    {
        for (size_t i = 0; i < sizeof...(I); ++i)
            m_buffer[i + 1] = decrypt(m_buffer[i + 1]);
        m_buffer[sizeof...(I) + 1] = 0;
        return m_buffer + 1;
    }

private:
    // The key is read back from the buffer rather than from K.
    constexpr char key() const { return m_buffer[0]; }
    constexpr char encrypt(char c) const { return c ^ key(); }
    constexpr char decrypt(char c) const { return encrypt(c); }

private:
    // Key byte + sizeof...(I) characters + null terminator.
    char m_buffer[sizeof...(I) + 2];
};

// Random key character for MetaString3, selected by N.
template<int N>
struct MetaRandomChar3
{
    // MetaRandom<N, 0x7F - 1> is reduced modulo 0x7E, so the key is at least 1
    // and stays below 0x7F.
    static const char value = static_cast<char>(1 + MetaRandom<N, 0x7F - 1>::value);
};

int main() {
#define cstr "1234"
    auto temp = MetaString3<Make_Indexes<sizeof(cstr) - 1>::type, \
                            MetaRandomChar3<__COUNTER__>::value>(cstr);
    return 0;
}

第四个版本

能不能实现多个算法,每次随机挑选一个呢。 实现三个算法,每次随机挑选一个。

模板偏特化 Template partial specialization:

template<int A, int K, typename Indexes>
struct MetaString4;

template<int K, int... I>
struct MetaString4<0, K, Indexes<I...>>
{  c ^ K  };

template<int K, int... I>
struct MetaString4<1, K, Indexes<I...>>
{  c + K  };

#define DEF_OBFUSCATED4(str) MetaString4<MetaRandom<__COUNTER__, 2>::value, …

完整实现:

// Three parameters: N selects the algorithm, Key is the encryption key and
// Indexes is the list of character positions of the string.
// Only the specializations for individual values of N are defined.
template<int N, char Key, typename Indexes>
struct MetaString;

第一个算法:

// Algorithm 0: every character is XOR-ed with the same key K.
template<char K, int... I>
struct MetaString<0, K, Indexes<I...>>
{
    // Encrypts with the template constant K (not with m_key), one character
    // per index; the extra trailing element of m_buffer is zero-initialized.
    constexpr __forceinline MetaString(const char* str)
        : m_key{ K }, m_buffer{ encrypt(str[I], K)... } { }

    // Run-time decryption with the stored key: XORs the buffer in place,
    // terminates it and returns a pointer into this object. The cast only
    // strips the volatile qualifier from the buffer pointer.
    inline const char* decrypt()
    {
        for (size_t i = 0; i < sizeof...(I); ++i)
            m_buffer[i] = decrypt(m_buffer[i]);
        m_buffer[sizeof...(I)] = 0;
        return const_cast<const char*>(m_buffer);
    }

private:
    constexpr char key() const { return m_key; }
    constexpr char __forceinline encrypt(char c, int k) const { return c ^ k; }
    constexpr char decrypt(char c) const { return encrypt(c, key()); }

    volatile int m_key; // volatile keeps the compiler from optimizing too aggressively
    volatile char m_buffer[sizeof...(I) + 1];
};

第二个算法:

// Algorithm 1: the character at position p is XOR-ed with (K + p), so the
// effective key changes along the string.
template<char K, int... I>
struct MetaString<1, K, Indexes<I...>>
{
    // Encrypts with the template constant K, exactly like algorithm 0 does.
    // Going through the volatile m_key here would force every XOR to be
    // performed at run time on the plaintext characters, because a volatile
    // read can never be part of a constant expression.
    constexpr __forceinline MetaString(const char* str)
        : m_key(K), m_buffer{ encrypt(str[I], K, I)... } { }

    // Run-time decryption with the stored key: XORs the buffer in place,
    // terminates it and returns a pointer into this object. The cast only
    // strips the volatile qualifier from the buffer pointer.
    inline const char* decrypt()
    {
        for (size_t i = 0; i < sizeof...(I); ++i)
            m_buffer[i] = decrypt(m_buffer[i], i);
        m_buffer[sizeof...(I)] = 0;
        return const_cast<const char*>(m_buffer);
    }

private:
    // Key for the character at `position`, derived from the base key.
    constexpr char key(int base, size_t position) const { return static_cast<char>(base + position); }
    constexpr char __forceinline encrypt(char c, int base, size_t position) const { return c ^ key(base, position); }
    // Not constexpr: it reads the volatile m_key, which is what keeps the
    // decryption from being folded away by the optimizer.
    char decrypt(char c, size_t position) const { return encrypt(c, m_key, position); }

    volatile int m_key;
    volatile char m_buffer[sizeof...(I) + 1];
};

第三个算法:

// Algorithm 2: every character is shifted by a small offset derived from K.
template<char K, int... I>
struct MetaString<2, K, Indexes<I...>>
{
    // Stores the shifted characters followed by an explicit null terminator.
    constexpr __forceinline MetaString(const char* str)
        : m_buffer{ encrypt(str[I])..., 0 } { }

    // Run-time decryption: shifts every character back in place and returns a
    // pointer into this object. The terminator was already written by the
    // constructor. The cast only strips the volatile qualifier.
    inline const char* decrypt()
    {
        for (size_t i = 0; i < sizeof...(I); ++i)
            m_buffer[i] = decrypt(m_buffer[i]);
        return const_cast<const char*>(m_buffer);
    }

private:
    // The offset must never be 0, hence the "1 +".
    constexpr char key(char key) const { return 1 + (key % 13); }
    constexpr char __forceinline encrypt(char c) const { return c + key(K); }
    constexpr char decrypt(char c) const { return c - key(K); }

    volatile char m_buffer[sizeof...(I) + 1];
};

汇总使用:

// Random key character for MetaString, selected by N.
template<int N>
struct MetaRandomChar
{
    // MetaRandom<N, 0x7F - 1> is reduced modulo 0x7E, so the key is at least 1
    // and stays below 0x7F.
    static const char value = static_cast<char>(1 + MetaRandom<N, 0x7F - 1>::value);
};

// Creates an encrypted MetaString object for the literal str. The first
// __COUNTER__ expansion picks one of the three algorithms, the second one picks
// the key, and sizeof(str) - 1 gives one index per character.
#define DEF_OBFUSCATED(str) MetaString<MetaRandom<__COUNTER__, 3>::value, \
                                       MetaRandomChar<__COUNTER__>::value, \
                                       Make_Indexes<sizeof(str) - 1>::type>(str)

// Creates the object and decrypts it in one expression. The returned pointer
// refers to a temporary that is destroyed at the end of the full expression, so
// it must be consumed (e.g. copied into a std::string) within that expression.
#define OBFUSCATED(str) (DEF_OBFUSCATED(str).decrypt())

int main() {
    // Keep the MetaString object in a named variable so that the pointer
    // returned by decrypt() stays valid for the rest of the scope.
    auto temp = DEF_OBFUSCATED("1234");
    auto cstr = temp.decrypt();
    return 0;
}

尾声

Debug 版本(未开启优化)中字符串似乎仍然存在,Release 版本中就没有了。 这里只实现了 char 的情况,wchar_t 类似,不再赘述。

  • 字符串直接使用的情况:
    • const char* str = OBFUSCATED("Baby Hai's Secret"); –- 拿到指针,指针对应的临时变量 MetaString 就释放了,非法用法。
    • const std::string stdstr = OBFUSCATED("Baby Hai's Secret"); –- stdstr 完成构造,对应的临时变量 MetaString 才释放,正确。
  • 定义和使用分离:
    • auto metastr = DEF_OBFUSCATED("Baby Hai's Secret");
    • const char* str = metastr.decrypt();

编译出来的二进制就反编译不到对应的字符串了,而且代码的可读性也保证了。

更多思考:最底层应该是基于汇编的优化,当一个函数是确定的,参数也是确定的,直接在编译期就可以算出结果了。

xorstr.hpp

发现一个貌似更牛的版本,等有时间了,好好学习一下。 xorstr A heavily vectorized c++17 compile time string encryption.

用 Intel AVX 并行计算。<immintrin.h> 进行了汇编优化。AVX 指令集

  • 如果是英特尔的 CPU,只要是二代或以后的 i3、i5 或者 i7 这几种,就会支持 AVX 指令集,奔腾及赛扬暂不支持。
  • 如果是 AMD 的产品,则推土机架构以后的 FX、速龙系列、APU、锐龙系列都支持。

最简单的方法是用 CPU-z 检测一下,一目了然。

启用增强指令集 允许使用支持增强指令集的处理器上的指令,例如,IA-32 的 SSE、SSE2、AVX、AVX2 和 AVX-512 增强;x64 的 AVX、AVX2 和 AVX-512 增强。 当前只有在为 x86 体系结构生成程序时,/arch:SSE 和 /arch:SSE2 才可用。如果未指定任何选项,则编译器将使用支持 SSE2 的处理器上的指令。 可以通过 /arch:IA32 禁用对增强指令的使用。(/arch:SSE、/arch:SSE2、/arch:AVX、/arch:AVX2、/arch:AVX512、/arch:IA32)

quick example

int main() {
    // The temporary created by XORSTR lives until the end of this statement,
    // so the decrypted pointer is still valid while puts reads it.
    std::puts(XORSTR("an extra long hello_world"));
}

API

// This macro creates an encrypted xor_string string instance.
#define XORSTR_DEF(string) xor_string<...>{string}

// For convenience sake there is also a macro to instantly decrypt the string
#define XORSTR(string) XORSTR_DEF(string).decrypt()

struct xor_string<CharT, ...> {
    using size_type     = std::size_t;
    using value_type    = CharT;
    using pointer       = value_type*;
    using const_pointer = const value_type*;

    // Returns string size in characters, not including null terminator.
    constexpr size_type size() const;

    // Runs the encryption/decryption algorithm on the internal storage.
    void crypt() noexcept;

    // Returns const pointer to the storage, without doing any modifications to it.
    const_pointer get() const;

    // Returns non const pointer to the storage, without doing any modifications to it.
    pointer get();

    // Runs crypt() and returns the pointer to the internal storage.
    pointer decrypt();
}

noteworthy things

  • All keys are 64bit and generated during compile time.
  • Data blocks go in increments of 16 bytes so some space may be wasted.
  • The code has been crafted so that all the data would be embedded directly into code and not stored on .rdata and such.
  • The entirety of string encryption and decryption will be inlined.

supported compilers and platforms

  • Tested to be working on clang 5.0+, gcc 7.1+ and MSVC v141.
  • If your CPU does not support AVX define JM_XORSTR_DISABLE_AVX_INTRINSICS to only use SSE.
    • 在实际工程中,JM_XORSTR_DISABLE_AVX_INTRINSICS 我屏蔽掉了 AVX 指令集优化。

example assembly output

Output of gcc (trunk) from the quick example 生成的 AVX VMOVDQA 汇编指令集。

main:
  # Eight 64-bit immediates are written to the stack below: [rsp]..[rsp+31]
  # form the first 32-byte operand and [rsp+32]..[rsp+63] the second one.
  movabs rax, -4762152789334367252
  push rbp
  mov rbp, rsp
  # Align the stack to 32 bytes so the aligned vmovdqa accesses are valid.
  and rsp, -32
  sub rsp, 64
  mov QWORD PTR [rsp], rax
  # rdi will be the argument of puts: the start of the first operand.
  mov rdi, rsp
  movabs rax, -6534519754492314190
  mov QWORD PTR [rsp+8], rax
  movabs rax, -2862143164529545214
  mov QWORD PTR [rsp+16], rax
  movabs rax, -4140208776682645948
  mov QWORD PTR [rsp+24], rax
  # Load the complete first operand into ymm1.
  vmovdqa ymm1, YMMWORD PTR [rsp]
  movabs rax, -2550414817236710003
  mov QWORD PTR [rsp+32], rax
  movabs rax, -4595755740016602734
  mov QWORD PTR [rsp+40], rax
  movabs rax, -5461194525092864914
  mov QWORD PTR [rsp+48], rax
  movabs rax, -4140208776682645984
  mov QWORD PTR [rsp+56], rax
  # XOR both 32-byte operands and store the result over the first one.
  # Presumably the first operand is the encrypted text and the second the keys,
  # since the result at [rsp] is what gets printed.
  vpxor ymm0, ymm1, YMMWORD PTR [rsp+32]
  vmovdqa YMMWORD PTR [rsp], ymm0
  vzeroupper
  call puts
  xor eax, eax
  leave
  ret

source code

/*
 * Copyright 2017 - 2021 Justas Masiulis
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef JM_XORSTR_HPP
#define JM_XORSTR_HPP

#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM) || defined(__arm__)
#include <arm_neon.h>
#elif defined(_M_X64) || defined(__amd64__) || defined(_M_IX86) || defined(__i386__)
#include <immintrin.h>
#else
#error Unsupported platform
#endif

#include <cstdint>
#include <cstddef>
#include <utility>
#include <type_traits>

// Creates an encrypted xor_string for the literal str. The literal is passed
// wrapped in a lambda, together with its length in characters (terminator
// included) and one index per 64-bit word of the internal buffer.
#define XORSTR_DEF(str) ::jm::xor_string([]() { return str; }, \
                std::integral_constant<std::size_t, sizeof(str) / sizeof(*str)>{}, \
                std::make_index_sequence<::jm::detail::_buffer_size<sizeof(str)>()>{})
// Creates the xor_string and decrypts it in one expression. The returned
// pointer refers to a temporary object.
#define XORSTR(str) XORSTR_DEF(str).decrypt()

// Compiler-specific spelling of "always inline".
#ifdef _MSC_VER
#define XORSTR_FORCEINLINE __forceinline
#else
#define XORSTR_FORCEINLINE __attribute__((always_inline)) inline
#endif

namespace jm {

    namespace detail {

        // Number of 64-bit words needed for Size bytes, rounded up to whole
        // 16-byte blocks (two words per block).
        template<std::size_t Size>
        XORSTR_FORCEINLINE constexpr std::size_t _buffer_size()
        {
            constexpr std::size_t whole_blocks = Size / 16;
            constexpr std::size_t tail_block   = (Size % 16 != 0) ? 1 : 0;
            return (whole_blocks + tail_block) * 2;
        }

        // 32-bit key: starts from Seed and mixes in every character of the
        // build time string (terminator included) by XOR followed by a
        // multiplication with 16777619.
        template<std::uint32_t Seed>
        XORSTR_FORCEINLINE constexpr std::uint32_t key4() noexcept
        {
            constexpr char build_time[] = __TIME__;
            std::uint32_t  hash         = Seed;
            for(std::size_t i = 0; i < sizeof(build_time); ++i)
                hash = static_cast<std::uint32_t>((hash ^ build_time[i]) * 16777619ull);
            return hash;
        }

        // 64-bit key for index S: two chained 32-bit keys, the first one in
        // the upper half and the second one in the lower half.
        template<std::size_t S>
        XORSTR_FORCEINLINE constexpr std::uint64_t key8()
        {
            constexpr std::uint32_t high = key4<2166136261 + S>();
            constexpr std::uint32_t low  = key4<high>();
            return (std::uint64_t{ high } << 32) | low;
        }

        // Packs the characters of block idx (as many characters as fit into
        // 8 bytes) into a 64-bit word, lowest character first, and XORs them
        // onto key. Positions at or beyond N keep the plain key bits.
        template<std::size_t N, class CharT>
        XORSTR_FORCEINLINE constexpr std::uint64_t
        load_xored_str8(std::uint64_t key, std::size_t idx, const CharT* str) noexcept
        {
            using unsigned_char_t = typename std::make_unsigned<CharT>::type;
            constexpr auto char_bytes      = sizeof(CharT);
            constexpr auto chars_per_block = 8 / char_bytes;

            const std::size_t first = idx * chars_per_block;
            std::uint64_t     block = key;
            for(std::size_t i = 0; i < chars_per_block; ++i) {
                if(first + i >= N)
                    break;
                // Go through the unsigned type so that negative characters are
                // not sign-extended into the neighbouring bytes.
                const auto ch = static_cast<unsigned_char_t>(str[first + i]);
                block ^= std::uint64_t{ ch } << (i * 8 * char_bytes);
            }

            return block;
        }

        // forces compiler to use registers instead of stuffing constants in rdata
        // Returns value unchanged, but in a form the optimizer cannot see through.
        XORSTR_FORCEINLINE std::uint64_t load_from_reg(std::uint64_t value) noexcept
        {
#if defined(__clang__) || defined(__GNUC__)
            // Empty asm statement that takes value in a register and declares
            // the same register as its output.
            asm("" : "=r"(value) : "0"(value) :);
            return value;
#else
            // Other compilers: round-trip the value through a volatile local.
            volatile std::uint64_t reg = value;
            return reg;
#endif
        }

    } // namespace detail

    // Primary template: only the specialization on integer_sequence /
    // index_sequence below is defined.
    template<class CharT, std::size_t Size, class Keys, class Indices>
    class xor_string;

    // Encrypted string of Size characters (terminator included). Keys holds one
    // 64-bit XOR key per word of storage; Indices enumerates those words.
    template<class CharT, std::size_t Size, std::uint64_t... Keys, std::size_t... Indices>
    class xor_string<CharT, Size, std::integer_sequence<std::uint64_t, Keys...>, std::index_sequence<Indices...>> {
        // 32-byte alignment is only requested when AVX is enabled and the
        // string is longer than 16 characters; otherwise 16 bytes.
#ifndef JM_XORSTR_DISABLE_AVX_INTRINSICS
        constexpr static inline std::uint64_t alignment = ((Size > 16) ? 32 : 16);
#else
        constexpr static inline std::uint64_t alignment = 16;
#endif

        // The string data, XOR-ed with Keys, one 64-bit word per key.
        alignas(alignment) std::uint64_t _storage[sizeof...(Keys)];

    public:
        using value_type    = CharT;
        using size_type     = std::size_t;
        using pointer       = CharT*;
        using const_pointer = const CharT*;

        // l is a callable returning the string literal. Every word is computed
        // as a template argument of integral_constant, i.e. as a constant
        // expression, and then passed through load_from_reg.
        template<class L>
        XORSTR_FORCEINLINE xor_string(L l, std::integral_constant<std::size_t, Size>,
                std::index_sequence<Indices...>) noexcept
            : _storage{ ::jm::detail::load_from_reg(
                (std::integral_constant<std::uint64_t, detail::load_xored_str8<Size>(Keys, Indices, l())>::value))... }
        {}

        // String length in characters, not counting the null terminator.
        XORSTR_FORCEINLINE constexpr size_type size() const noexcept
        {
            return Size - 1;
        }

        // XORs the storage with the keys in place. Applying it twice restores
        // the previous contents.
        XORSTR_FORCEINLINE void crypt() noexcept
        {
            // everything is inlined by hand because a certain compiler with a certain linker is _very_ slow
            // Materialize the keys in a local array; on clang the array address
            // is additionally passed through load_from_reg.
#if defined(__clang__)
            alignas(alignment)
                std::uint64_t arr[]{ ::jm::detail::load_from_reg(Keys)... };
            std::uint64_t*    keys =
                (std::uint64_t*)::jm::detail::load_from_reg((std::uint64_t)arr);
#else
            alignas(alignment) std::uint64_t keys[]{ ::jm::detail::load_from_reg(Keys)... };
#endif

            // Each branch is a fold over Indices: an index below the number of
            // vectors in _storage XORs that vector, any other index does nothing.
#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM) || defined(__arm__)
            // ARM: NEON, 16 bytes (two words) per step.
#if defined(__clang__)
            ((Indices >= sizeof(_storage) / 16 ? static_cast<void>(0) : __builtin_neon_vst1q_v(
                                    reinterpret_cast<uint64_t*>(_storage) + Indices * 2,
                                    veorq_u64(__builtin_neon_vld1q_v(reinterpret_cast<const uint64_t*>(_storage) + Indices * 2, 51),
                                              __builtin_neon_vld1q_v(reinterpret_cast<const uint64_t*>(keys) + Indices * 2, 51)),
                                    51)), ...);
#else // GCC, MSVC
            ((Indices >= sizeof(_storage) / 16 ? static_cast<void>(0) : vst1q_u64(
                        reinterpret_cast<uint64_t*>(_storage) + Indices * 2,
                        veorq_u64(vld1q_u64(reinterpret_cast<const uint64_t*>(_storage) + Indices * 2),
                                  vld1q_u64(reinterpret_cast<const uint64_t*>(keys) + Indices * 2)))), ...);
#endif
#elif !defined(JM_XORSTR_DISABLE_AVX_INTRINSICS)
            // x86 with AVX: 32 bytes per step.
            ((Indices >= sizeof(_storage) / 32 ? static_cast<void>(0) : _mm256_store_si256(
                reinterpret_cast<__m256i*>(_storage) + Indices,
                _mm256_xor_si256(
                    _mm256_load_si256(reinterpret_cast<const __m256i*>(_storage) + Indices),
                    _mm256_load_si256(reinterpret_cast<const __m256i*>(keys) + Indices)))), ...);

            // A trailing 16-byte block that does not fill a 32-byte vector is
            // handled with one SSE operation on the last two words.
            if constexpr(sizeof(_storage) % 32 != 0)
                _mm_store_si128(
                    reinterpret_cast<__m128i*>(_storage + sizeof...(Keys) - 2),
                    _mm_xor_si128(_mm_load_si128(reinterpret_cast<const __m128i*>(_storage + sizeof...(Keys) - 2)),
                                  _mm_load_si128(reinterpret_cast<const __m128i*>(keys + sizeof...(Keys) - 2))));
#else
        // x86 without AVX: SSE, 16 bytes per step.
        ((Indices >= sizeof(_storage) / 16 ? static_cast<void>(0) : _mm_store_si128(
            reinterpret_cast<__m128i*>(_storage) + Indices,
            _mm_xor_si128(_mm_load_si128(reinterpret_cast<const __m128i*>(_storage) + Indices),
                          _mm_load_si128(reinterpret_cast<const __m128i*>(keys) + Indices)))), ...);
#endif
        }

        // Read-only pointer to the storage; the contents are not modified.
        XORSTR_FORCEINLINE const_pointer get() const noexcept
        {
            return reinterpret_cast<const_pointer>(_storage);
        }

        // Mutable pointer to the storage; the contents are not modified.
        XORSTR_FORCEINLINE pointer get() noexcept
        {
            return reinterpret_cast<pointer>(_storage);
        }

        // Same XOR pass as crypt(), followed by returning a pointer to the
        // storage inside this object.
        XORSTR_FORCEINLINE pointer decrypt() noexcept
        {
            // crypt() is inlined by hand because a certain compiler with a certain linker is _very_ slow
#if defined(__clang__)
            alignas(alignment)
                std::uint64_t arr[]{ ::jm::detail::load_from_reg(Keys)... };
            std::uint64_t*    keys =
                (std::uint64_t*)::jm::detail::load_from_reg((std::uint64_t)arr);
#else
            alignas(alignment) std::uint64_t keys[]{ ::jm::detail::load_from_reg(Keys)... };
#endif

#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM) || defined(__arm__)
            // ARM: NEON, 16 bytes (two words) per step.
#if defined(__clang__)
            ((Indices >= sizeof(_storage) / 16 ? static_cast<void>(0) : __builtin_neon_vst1q_v(
                                    reinterpret_cast<uint64_t*>(_storage) + Indices * 2,
                                    veorq_u64(__builtin_neon_vld1q_v(reinterpret_cast<const uint64_t*>(_storage) + Indices * 2, 51),
                                              __builtin_neon_vld1q_v(reinterpret_cast<const uint64_t*>(keys) + Indices * 2, 51)),
                                    51)), ...);
#else // GCC, MSVC
            ((Indices >= sizeof(_storage) / 16 ? static_cast<void>(0) : vst1q_u64(
                        reinterpret_cast<uint64_t*>(_storage) + Indices * 2,
                        veorq_u64(vld1q_u64(reinterpret_cast<const uint64_t*>(_storage) + Indices * 2),
                                  vld1q_u64(reinterpret_cast<const uint64_t*>(keys) + Indices * 2)))), ...);
#endif
#elif !defined(JM_XORSTR_DISABLE_AVX_INTRINSICS)
            // x86 with AVX: 32 bytes per step, then the 16-byte remainder.
            ((Indices >= sizeof(_storage) / 32 ? static_cast<void>(0) : _mm256_store_si256(
                reinterpret_cast<__m256i*>(_storage) + Indices,
                _mm256_xor_si256(
                    _mm256_load_si256(reinterpret_cast<const __m256i*>(_storage) + Indices),
                    _mm256_load_si256(reinterpret_cast<const __m256i*>(keys) + Indices)))), ...);

            if constexpr(sizeof(_storage) % 32 != 0)
                _mm_store_si128(
                    reinterpret_cast<__m128i*>(_storage + sizeof...(Keys) - 2),
                    _mm_xor_si128(_mm_load_si128(reinterpret_cast<const __m128i*>(_storage + sizeof...(Keys) - 2)),
                                  _mm_load_si128(reinterpret_cast<const __m128i*>(keys + sizeof...(Keys) - 2))));
#else
        // x86 without AVX: SSE, 16 bytes per step.
        ((Indices >= sizeof(_storage) / 16 ? static_cast<void>(0) : _mm_store_si128(
            reinterpret_cast<__m128i*>(_storage) + Indices,
            _mm_xor_si128(_mm_load_si128(reinterpret_cast<const __m128i*>(_storage) + Indices),
                          _mm_load_si128(reinterpret_cast<const __m128i*>(keys) + Indices)))), ...);
#endif

            return (pointer)(_storage);
        }
    };

    // Deduction guide: derives the class template arguments from the
    // constructor arguments. The character type is the type of l()[0] without
    // const and reference, and one key8<Indices>() key is generated per index.
    template<class L, std::size_t Size, std::size_t... Indices>
    xor_string(L l, std::integral_constant<std::size_t, Size>, std::index_sequence<Indices...>) -> xor_string<
                std::remove_const_t<std::remove_reference_t<decltype(l()[0])>>,
                Size,
                std::integer_sequence<std::uint64_t, detail::key8<Indices>()...>,
                std::index_sequence<Indices...>>;

} // namespace jm

#endif // include guard

精简一下,便于阅读理解。

#ifndef JM_XORSTR_HPP
#define JM_XORSTR_HPP

#include <immintrin.h>
#include <cstdint>
#include <cstddef>
#include <utility>
#include <type_traits>

namespace jm {

namespace detail {

// Number of 64-bit words needed for Size bytes, rounded up to whole 16-byte
// blocks (two words per block).
template <std::size_t Size>
__forceinline constexpr std::size_t _buffer_size() {
    constexpr std::size_t whole_blocks = Size / 16;
    constexpr std::size_t tail_block = (Size % 16 != 0) ? 1 : 0;
    return (whole_blocks + tail_block) * 2;
}

// 32-bit key: starts from Seed and mixes in every character of the build time
// string, terminator included.
template <std::uint32_t Seed>
__forceinline constexpr std::uint32_t key4() noexcept {
    std::uint32_t value = Seed;
    for (char c : __TIME__) { // FNV-style hash step: XOR, then multiply
        // The multiplier is the 32-bit FNV prime:
        // FNV_prime = 2^24 + 2^8 + 0x93 = 16777619
        value = static_cast<std::uint32_t>((value ^ c) * 16777619ull);
    }
    return value;
}

// 64-bit key for index S: two chained 32-bit keys, the first one in the upper
// half and the second one in the lower half.
template <std::size_t S>
__forceinline constexpr std::uint64_t key8() {
    // The 32-bit FNV offset_basis is 2166136261 = 0x811c9dc5
    constexpr auto first_part = key4<2166136261 + S>();
    constexpr auto second_part = key4<first_part>();
    return (static_cast<std::uint64_t>(first_part) << 32) | second_part;
}

// loads up to 8 characters of string into uint64 and xors it with the key
// The string is processed one 64-bit word at a time.
// N is the number of characters; positions at or beyond N keep the key bits.
template <std::size_t N, class CharT>
__forceinline constexpr std::uint64_t load_xored_str8(std::uint64_t key,
                std::size_t idx, const CharT* str) noexcept {
    using cast_type = typename std::make_unsigned<CharT>::type;
    // 1-byte characters: value_size=1, block_size=8
    // 2-byte characters: value_size=2, block_size=4
    constexpr auto value_size = sizeof(CharT);
    constexpr auto block_size = 8 / value_size; // one word is 64 bits, i.e. 8 bytes

    std::uint64_t value = key;
    for (std::size_t i = 0; i < block_size && idx * block_size + i < N; ++i) {
        // Bit position of character i inside the word (lowest character first).
        auto offset = i * 8 * value_size;
        // The unsigned cast avoids sign extension into neighbouring characters.
        auto p = static_cast<cast_type>(str[idx * block_size + i]);
        value ^= std::uint64_t{p} << offset;
    }
    return value;
}

// forces compiler to use registers instead of stuffing constants in rdata
// Keeps the constant out of the .rdata section of the PE file by round-tripping
// it through a volatile local; the value itself is returned unchanged.
__forceinline std::uint64_t load_from_reg(std::uint64_t value) noexcept {
    volatile std::uint64_t reg = value;
    return reg;
}

}; // namespace detail

}; // namespace jm

namespace jm {

// Primary template: only the specialization below is defined.
template <class CharT, std::size_t Size, class Keys, class Indices>
class xor_string;

template <class CharT, std::size_t Size, std::uint64_t... Keys, std::size_t... Indices>
// Keys arrives as a std::integer_sequence of 64-bit XOR keys, Indices as a
// std::index_sequence enumerating the 64-bit words of the storage.
class xor_string<CharT, Size,
                std::integer_sequence<std::uint64_t, Keys...>,
                std::index_sequence<Indices...>> {
    // Strings longer than 16 characters are aligned for 32-byte AVX accesses.
    constexpr static inline std::uint64_t alignment = ((Size > 16) ? 32 : 16);
    // The string data, XOR-ed with Keys, one 64-bit word per key.
    alignas(alignment) std::uint64_t _storage[sizeof...(Keys)];

  public:
    using value_type = CharT;
    using size_type = std::size_t;
    using pointer = CharT*;
    using const_pointer = const CharT*;

    // std::integral_constant wraps a compile-time constant of a given type,
    // which forces every word to be computed as a constant expression.
    // L l -- a lambda returning the string literal.
    template <class L>
    __forceinline xor_string(L l,
                    std::integral_constant<std::size_t, Size>,
                    std::index_sequence<Indices...>) noexcept
        : _storage{::jm::detail::load_from_reg(
              (std::integral_constant<std::uint64_t,
                    detail::load_xored_str8<Size>(Keys, Indices, l())>::value))...} {
    }

    // String length in characters, not counting the null terminator.
    __forceinline constexpr size_type size() const noexcept {
        return Size - 1;
    }

    // XORs the storage with the keys in place; applying it twice restores the
    // previous contents.
    __forceinline void crypt() noexcept {
        // everything is inlined by hand because a certain compiler with a certain linker is _very_ slow
        alignas(alignment) std::uint64_t keys[]{::jm::detail::load_from_reg(Keys)...};

        /**
         * Intel AVX data-parallel processing, see <immintrin.h>.
         * _mm256_load_si256 / _mm256_store_si256 -- Move Aligned Packed Integer Values
         * **** VMOVDQA ymm1, m256
         * **** VMOVDQA m256, ymm1
         * Moves 256 bits of packed integer values from the source operand to the destination
         *
         * The fold below visits every index: an index below the number of
         * 32-byte vectors in _storage XORs that vector, any other index does nothing.
         */
        ((Indices >= sizeof(_storage) / 32
              ? static_cast<void>(0)
              : _mm256_store_si256(
                    reinterpret_cast<__m256i*>(_storage) + Indices,
                    _mm256_xor_si256(_mm256_load_si256(reinterpret_cast<const __m256i*>(_storage) + Indices),
                                     _mm256_load_si256(reinterpret_cast<const __m256i*>(keys) + Indices)))),
         ...);

        if constexpr (sizeof(_storage) % 32 != 0) { // the trailing 16 bytes are handled separately with SSE
            _mm_store_si128(
                reinterpret_cast<__m128i*>(_storage + sizeof...(Keys) - 2),
                _mm_xor_si128(_mm_load_si128(reinterpret_cast<const __m128i*>(_storage + sizeof...(Keys) - 2)),
                              _mm_load_si128(reinterpret_cast<const __m128i*>(keys + sizeof...(Keys) - 2))));
        }
    }

    // Read-only pointer to the storage; the contents are not modified.
    __forceinline const_pointer get() const noexcept {
        return reinterpret_cast<const_pointer>(_storage);
    }

    // Mutable pointer to the storage; the contents are not modified.
    __forceinline pointer get() noexcept {
        return reinterpret_cast<pointer>(_storage);
    }

    // Same XOR pass as crypt(), followed by returning a pointer to the storage
    // inside this object.
    __forceinline pointer decrypt() noexcept {
        // crypt() is inlined by hand because a certain compiler with a certain linker is _very_ slow
        alignas(alignment) std::uint64_t keys[]{::jm::detail::load_from_reg(Keys)...};

        // Intel AVX data-parallel processing, see <immintrin.h>.
        ((Indices >= sizeof(_storage) / 32
              ? static_cast<void>(0)
              : _mm256_store_si256(
                    reinterpret_cast<__m256i*>(_storage) + Indices,
                    _mm256_xor_si256(_mm256_load_si256(reinterpret_cast<const __m256i*>(_storage) + Indices),
                                     _mm256_load_si256(reinterpret_cast<const __m256i*>(keys) + Indices)))),
         ...);

        // Trailing 16 bytes that do not fill a 32-byte vector.
        if constexpr (sizeof(_storage) % 32 != 0) {
            _mm_store_si128(
                reinterpret_cast<__m128i*>(_storage + sizeof...(Keys) - 2),
                _mm_xor_si128(_mm_load_si128(reinterpret_cast<const __m128i*>(_storage + sizeof...(Keys) - 2)),
                              _mm_load_si128(reinterpret_cast<const __m128i*>(keys + sizeof...(Keys) - 2))));
        }
        return (pointer)(_storage);
    }
};

// Deduction guide for xor_string.
// std::remove_reference: if T is a reference type, the member typedef `type`
// is the referred-to type; otherwise `type` is T itself.
// std::remove_const_t<std::remove_reference_t<decltype(l()[0])>> -- the type of the first character.
// std::integer_sequence<std::uint64_t, detail::key8<Indices>()...> -- the sequence of XOR keys.
// std::index_sequence<Indices...> -- the list of word indices.
template <class L, std::size_t Size, std::size_t... Indices>
xor_string(L l, std::integral_constant<std::size_t, Size>, std::index_sequence<Indices...>)
    -> xor_string<std::remove_const_t<std::remove_reference_t<decltype(l()[0])>>, \
                  Size, \
                  std::integer_sequence<std::uint64_t, detail::key8<Indices>()...>, \
                  std::index_sequence<Indices...>>;

}; // namespace jm

// XORSTRDEF(str): builds a ::jm::xor_string object for the string literal `str`.
//   - the literal is handed over wrapped in a captureless lambda;
//   - its element count (sizeof the literal divided by sizeof one element, so
//     the terminator is counted) travels as a std::integral_constant;
//   - the index sequence has ::jm::detail::_buffer_size<sizeof(str)>() entries.
// XORSTR(str): same object, with decrypt() called on it right away. The object
// is a temporary, so the pointer decrypt() returns should only be used inside
// the same full-expression.
#define XORSTRDEF(str) \
    ::jm::xor_string([] { return str; }, \
                     std::integral_constant<std::size_t, sizeof(str) / sizeof((str)[0])>{}, \
                     std::make_index_sequence<::jm::detail::_buffer_size<sizeof(str)>()>{})
#define XORSTR(str) XORSTRDEF(str).decrypt()

#endif // include guard

decode

#include <iostream>
#include "../MetaString4.h"

// Narrow-string demo: wrap "abc" with DEF_OBFUSCATED4 (from MetaString4.h),
// decrypt it at run time and print the result.
void mainstr()
{
    auto obfuscated = DEF_OBFUSCATED4("abc");
    const auto plain = obfuscated.decrypt();
    printf("%s", plain);
}

// Wide-string demo: wrap L"abc" with DEF_OBFUSCATED4 (from MetaString4.h),
// decrypt it at run time and print the result.
void mainstrw() {
    auto temp = DEF_OBFUSCATED4(L"abc");
    // %ls, not %s: in standard C a bare %s inside a wide format string expects
    // a narrow char*, so passing a wchar_t* only worked thanks to MSVC's
    // legacy specifier semantics. %ls means "wide string" on MSVC and on
    // conforming runtimes alike.
    wprintf(L"%ls", temp.decrypt());
}

// Entry point of the demo: runs the narrow-string case, then the wide one.
int main() {
    mainstr();   // char literal
    mainstrw();  // wchar_t literal
    return 0;
}
; Attributes: bp-based frame

; int __cdecl main()
_main proc near

var_48= dword ptr -48h
var_44= dword ptr -44h
var_40= xmmword ptr -40h
var_30= xmmword ptr -30h
var_20= xmmword ptr -20h
var_4= dword ptr -4

push    ebp
mov     ebp, esp
and     esp, 0FFFFFFF0h
sub     esp, 50h
mov     eax, ___security_cookie
xor     eax, esp
mov     [esp+50h+var_4], eax
mov     [esp+50h+var_48], 2F3191A4h
mov     [esp+50h+var_44], 0E0971183h
mov     eax, [esp+50h+var_48]
mov     ecx, [esp+50h+var_44]
mov     dword ptr [esp+50h+var_30], eax
mov     dword ptr [esp+50h+var_30+4], ecx
mov     [esp+50h+var_48], 0C9BF4616h
mov     [esp+50h+var_44], 52292A6Ah
mov     eax, [esp+50h+var_48]
mov     ecx, [esp+50h+var_44]
mov     dword ptr [esp+50h+var_30+8], eax
mov     dword ptr [esp+50h+var_30+0Ch], ecx
mov     [esp+50h+var_48], 2F52F3C5h
mov     [esp+50h+var_44], 0E0971183h
mov     eax, [esp+50h+var_48]
mov     ecx, [esp+50h+var_44]
mov     dword ptr [esp+50h+var_40], eax
mov     [esp+50h+var_48], 0C9BF4616h
mov     dword ptr [esp+50h+var_40+4], ecx
mov     [esp+50h+var_44], 52292A6Ah
mov     eax, [esp+50h+var_48]
mov     ecx, [esp+50h+var_44]
mov     dword ptr [esp+50h+var_40+8], eax
lea     eax, [esp+50h+var_30]
mov     dword ptr [esp+50h+var_40+0Ch], ecx
movaps  xmm1, [esp+50h+var_40]
pxor    xmm1, [esp+50h+var_30]
push    eax
push    offset _Format  ; "%s"
movaps  [esp+58h+var_30], xmm1
call    _printf
mov     [esp+58h+var_48], 2F30F3A4h
mov     [esp+58h+var_44], 0E09711E0h
mov     eax, [esp+58h+var_48]
mov     ecx, [esp+58h+var_44]
mov     dword ptr [esp+58h+var_20], eax
mov     dword ptr [esp+58h+var_20+4], ecx
mov     [esp+58h+var_48], 0C9BF4616h
mov     [esp+58h+var_44], 52292A6Ah
mov     eax, [esp+58h+var_48]
mov     ecx, [esp+58h+var_44]
mov     dword ptr [esp+58h+var_20+8], eax
mov     dword ptr [esp+58h+var_20+0Ch], ecx
mov     [esp+58h+var_48], 2F52F3C5h
mov     [esp+58h+var_44], 0E0971183h
mov     eax, [esp+58h+var_48]
mov     ecx, [esp+58h+var_44]
mov     dword ptr [esp+58h+var_40], eax
mov     [esp+58h+var_48], 0C9BF4616h
mov     dword ptr [esp+58h+var_40+4], ecx
mov     [esp+58h+var_44], 52292A6Ah
mov     eax, [esp+58h+var_48]
mov     ecx, [esp+58h+var_44]
mov     dword ptr [esp+58h+var_40+8], eax
lea     eax, [esp+58h+var_20]
mov     dword ptr [esp+58h+var_40+0Ch], ecx
movaps  xmm1, [esp+58h+var_40]
pxor    xmm1, [esp+58h+var_20]
push    eax
push    offset aS       ; "%s"
movaps  [esp+60h+var_20], xmm1
call    _wprintf
mov     ecx, [esp+60h+var_4]
add     esp, 10h
xor     ecx, esp        ; cookie
xor     eax, eax
call    @__security_check_cookie@4 ; __security_check_cookie(x)
mov     esp, ebp
pop     ebp
retn
_main endp

mainstr();

  • ESP,extended stack pointer
  • EBP,extended base pointer
C7 44 24 08 62 DE 0D 0E mov         dword ptr [esp+8],0E0DDE62h     # C7 44 24 xx ?? ?? ?? ??
C7 44 24 0C DA 71 8F 70 mov         dword ptr [esp+0Ch],708F71DAh   # C7 44 24 yy ?? ?? ?? ?? // yy = xx + 4
8B 44 24 08             mov         eax,dword ptr [esp+8]           # 8B 44 24 xx
8B 4C 24 0C             mov         ecx,dword ptr [esp+0Ch]         # 8B 4C 24 yy
89 44 24 20             mov         dword ptr [esp+20h],eax         # 89 44 24 aa // 首地址 [esp+20h]
89 4C 24 24             mov         dword ptr [esp+24h],ecx         # 89 4C 24 bb // bb = aa + 4 [esp+24h]

C7 44 24 08 68 E1 DB 44 mov         dword ptr [esp+8],44DBE168h     # C7 44 24 xx ?? ?? ?? ??
C7 44 24 0C 87 9A 82 9D mov         dword ptr [esp+0Ch],9D829A87h   # C7 44 24 yy ?? ?? ?? ?? // yy = xx + 4
8B 44 24 08             mov         eax,dword ptr [esp+8]           # 8B 44 24 xx
8B 4C 24 0C             mov         ecx,dword ptr [esp+0Ch]         # 8B 4C 24 yy
89 44 24 28             mov         dword ptr [esp+28h],eax         # 89 44 24 aa // 首地址 [esp+28h]
89 4C 24 2C             mov         dword ptr [esp+2Ch],ecx         # 89 4C 24 bb // bb = aa + 4 [esp+2Ch]

C7 44 24 08 03 BC 6E 0E mov         dword ptr [esp+8],0E6EBC03h     # C7 44 24 xx ?? ?? ?? ??
C7 44 24 0C DA 71 8F 70 mov         dword ptr [esp+0Ch],708F71DAh   # C7 44 24 yy ?? ?? ?? ?? // yy = xx + 4
8B 44 24 08             mov         eax,dword ptr [esp+8]           # 8B 44 24 xx
8B 4C 24 0C             mov         ecx,dword ptr [esp+0Ch]         # 8B 4C 24 yy
89 44 24 10             mov         dword ptr [esp+10h],eax         # 89 44 24 aa // 首地址 [esp+10h]
C7 44 24 08 68 E1 DB 44 mov         dword ptr [esp+8],44DBE168h     # C7 44 24 bb // bb = aa + 4 [esp+8]

89 4C 24 14             mov         dword ptr [esp+14h],ecx         # 89 4C 24 xx // 首地址 [esp+14h]
C7 44 24 0C 87 9A 82 9D mov         dword ptr [esp+0Ch],9D829A87h   # C7 44 24 yy ?? ?? ?? ??
8B 44 24 08             mov         eax,dword ptr [esp+8]           # 8B 44 24 xx
8B 4C 24 0C             mov         ecx,dword ptr [esp+0Ch]         # 8B 4C 24 yy
89 44 24 18             mov         dword ptr [esp+18h],eax         # 89 44 24 yy

8D 44 24 20             lea         eax,[esp+20h]                   # 8D 44 24 ??
89 4C 24 1C             mov         dword ptr [esp+1Ch],ecx         # 89 4C 24 ??

0F 28 4C 24 10          movaps      xmm1,xmmword ptr [esp+10h]      # 0F 28 4C 24 ??
66 0F EF 4C 24 20       pxor        xmm1,xmmword ptr [esp+20h]      # 66 0F EF 4C 24 ??

50                      push        eax
68 00 21 F9 00          push        offset string "%s" (0F92100h)
0F 29 4C 24 28          movaps      xmmword ptr [esp+28h],xmm1
E8 0D FF FF FF          call        printf (0F91050h)

mainstrw();

C7 44 24 10 62 BC 0C 0E mov         dword ptr [esp+10h],0E0CBC62h
C7 44 24 14 B9 71 8F 70 mov         dword ptr [esp+14h],708F71B9h
8B 44 24 10             mov         eax,dword ptr [esp+10h]
8B 4C 24 14             mov         ecx,dword ptr [esp+14h]
89 44 24 38             mov         dword ptr [esp+38h],eax
89 4C 24 3C             mov         dword ptr [esp+3Ch],ecx
C7 44 24 10 68 E1 DB 44 mov         dword ptr [esp+10h],44DBE168h
C7 44 24 14 87 9A 82 9D mov         dword ptr [esp+14h],9D829A87h
8B 44 24 10             mov         eax,dword ptr [esp+10h]
8B 4C 24 14             mov         ecx,dword ptr [esp+14h]
89 44 24 40             mov         dword ptr [esp+40h],eax
89 4C 24 44             mov         dword ptr [esp+44h],ecx
C7 44 24 10 03 BC 6E 0E mov         dword ptr [esp+10h],0E6EBC03h
C7 44 24 14 DA 71 8F 70 mov         dword ptr [esp+14h],708F71DAh
8B 44 24 10             mov         eax,dword ptr [esp+10h]
8B 4C 24 14             mov         ecx,dword ptr [esp+14h]
89 44 24 18             mov         dword ptr [esp+18h],eax
C7 44 24 10 68 E1 DB 44 mov         dword ptr [esp+10h],44DBE168h
89 4C 24 1C             mov         dword ptr [esp+1Ch],ecx
C7 44 24 14 87 9A 82 9D mov         dword ptr [esp+14h],9D829A87h
8B 44 24 10             mov         eax,dword ptr [esp+10h]
8B 4C 24 14             mov         ecx,dword ptr [esp+14h]
89 44 24 20             mov         dword ptr [esp+20h],eax
8D 44 24 38             lea         eax,[esp+38h]
89 4C 24 24             mov         dword ptr [esp+24h],ecx
0F 28 4C 24 18          movaps      xmm1,xmmword ptr [esp+18h]
66 0F EF 4C 24 38       pxor        xmm1,xmmword ptr [esp+38h]
50                      push        eax
68 04 21 F9 00          push        offset string L"%s" (0F92104h)
0F 29 4C 24 40          movaps      xmmword ptr [esp+40h],xmm1
E8 2E FE FF FF          call        wprintf (0F91010h)

IDA F5

// Hex-Rays pseudo-code for main(). There are no calls to mainstr()/mainstrw()
// in it; two XOR-and-print sequences appear directly in main().
//
// Each string is a 16-byte constant (v2 / v3) XORed with a second 16-byte
// constant (v0). v0 is loaded with the same values both times, i.e. it plays
// the role of the XOR key while v2 / v3 hold the encrypted text.
int __cdecl main()
{
  __m128i v0; // ST20_16
  __m128i v2; // [esp+20h] [ebp-30h]
  __m128i v3; // [esp+30h] [ebp-20h]

  // Narrow string. The low qwords XOR to bytes 61 62 63 65 66 67 00 00
  // ("abcefg"); the high qwords are identical and cancel to zero.
  // NOTE(review): that is not the "abc" used in the sample source, so this
  // listing presumably comes from a build with a different literal.
  v2.m128i_i64[0] = 0xD413D0267133D9Ei64;
  v2.m128i_i64[1] = 0x624D270D286B3C6Ci64;
  v0.m128i_i64[0] = 0xD415A6402705FFFi64;
  v0.m128i_i64[1] = 0x624D270D286B3C6Ci64;
  v2 = _mm_xor_si128(v0, v2);
  printf("%s", &v2);
  // Wide string. The XOR gives the 16-bit units 0061 0062 0063 0065 0066 0067
  // 0000 0000, i.e. L"abcefg" followed by zero padding.
  v3.m128i_i64[0] = 0xD245A0702125F9Ei64;
  v3.m128i_i64[1] = 0x624D270D280C3C0Ai64;
  v0.m128i_i64[0] = 0xD415A6402705FFFi64;
  v0.m128i_i64[1] = 0x624D270D286B3C6Ci64;
  v3 = _mm_xor_si128(v0, v3);
  wprintf(L"%s", &v3);
  return 0;
}

参考资料快照
参考资料快照

本文短链接:
If you have any questions or feedback, please reach out.