From 259090c18bce2853c7cc536a98b8975f3586f13c Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Fri, 31 Jan 2025 22:39:40 -0600 Subject: [PATCH 01/51] Introduce ARM Neon SIMD. --- ext/json/ext/generator/extconf.rb | 19 ++ ext/json/ext/generator/generator.c | 287 ++++++++++++++++++++++++++++- ext/json/ext/generator/simd.h | 48 +++++ test/json/json_generator_test.rb | 48 ++++- 4 files changed, 399 insertions(+), 3 deletions(-) create mode 100644 ext/json/ext/generator/simd.h diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 078068cf6..109a73a99 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -6,5 +6,24 @@ else append_cflags("-std=c99") $defs << "-DJSON_GENERATOR" + + if enable_config('generator-use-simd', default=true) + if RbConfig::CONFIG['host_cpu'] =~ /^(arm.*|aarch64.*)/ + # Try to compile a small program using NEON instructions + if have_header('arm_neon.h') + have_type('uint8x16_t', headers=['arm_neon.h']) && try_compile(<<~'SRC') + #include + int main() { + uint8x16_t test = vdupq_n_u8(32); + return 0; + } + SRC + $defs.push("-DENABLE_SIMD") + end + end + end + + create_header + create_makefile 'json/ext/generator' end diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 1bd6af6ed..f8744666c 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -4,6 +4,8 @@ #include #include +#include "simd.h" + /* ruby api and some helpers */ typedef struct JSON_Generator_StateStruct { @@ -166,14 +168,36 @@ static const unsigned char script_safe_escape_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9, }; +#ifdef ENABLE_SIMD + +struct _simd_state { +#ifdef HAVE_SIMD_NEON + struct { + uint8x16x4_t escape_table[4]; + uint8x16x4_t script_safe_escape_table[4]; + } neon; +#endif /* HAVE_SIMD_NEON */ +}; + +static struct _simd_state simd_state; + +#endif /* ENABLE_SIMD */ typedef struct _search_state { const char *ptr; const char *end; const char *cursor; FBuffer *buffer; + +#ifdef ENABLE_SIMD + const char *returned_from; + unsigned char maybe_matches[16]; + unsigned long current_match_index; +#endif /* ENABLE_SIMD */ } search_state; +unsigned char (*search_escape_impl)(search_state *, const unsigned char escape_table[256]); + static inline void search_flush(search_state *search) { fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); @@ -208,6 +232,227 @@ static inline unsigned char search_escape(search_state *search, const unsigned c return 0; } +#ifdef ENABLE_SIMD +#ifdef HAVE_SIMD_NEON + +static inline unsigned char search_update_matches_neon_lut(search_state *search, uint8x16x4_t *tables) { + while (search->ptr + 16 < search->end) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + + uint8x16_t tmp1 = vqtbl4q_u8(tables[0], chunk); + uint8x16_t tmp2 = vqtbl4q_u8(tables[1], veorq_u8(chunk, vdupq_n_u8(0x40))); + + uint8x16_t result = vorrq_u8(tmp1, tmp2); + + // The top 128 bytes of the escape_table are all 0. + // TODO is this a safe to do? + if (tables == simd_state.neon.script_safe_escape_table) { + uint8x16_t tmp3 = vqtbl4q_u8(tables[2], veorq_u8(chunk, vdupq_n_u8(0x80))); + uint8x16_t tmp4 = vqtbl4q_u8(tables[3], veorq_u8(chunk, vdupq_n_u8(0xc0))); + result = vorrq_u8(result, vorrq_u8(tmp3, tmp4)); + } + + if (vmaxvq_u8(result) == 0) { + search->ptr += 16; + continue; + } + + vst1q_u8(search->maybe_matches, result); + return 1; + } + + return 0; +} + +static unsigned char search_update_matches_neon_rules(search_state *search, const unsigned char escape_table[256]) { + const uint8x16_t lower_bound = vdupq_n_u8(' '); + const uint8x16_t backslash = vdupq_n_u8('\\'); + const uint8x16_t dblquote = vdupq_n_u8('\"'); + + if (escape_table == script_safe_escape_table) { + /* + * This works almost exactly the same as what is described above. The difference in this case comes after we know + * there is a byte to be escaped. In the previous case, all bytes were handled the same way. In this case, however, + * some bytes need to be handled differently. + * + * Since we know each byte in chunk can only match a single case, we logical AND each of the has_backslash, + * has_dblquote, and has_forward_slash with a different bit (0x1, 0x2 and 0x4 respectively) and combine + * the results with a logical OR. + * + * Now we loop over the result vector and switch on the particular pattern we just created. If we find a + * case we don't know, we simply lookup the byte in the script_safe_escape_table to determine the correct + * action. + */ + const uint8x16_t upper_bound = vdupq_n_u8('~'); + const uint8x16_t forward_slash = vdupq_n_u8('/'); + + while (search->ptr+16 < search->end) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); + + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t has_forward_slash = vceqq_u8(chunk, forward_slash); + + uint8x16_t needs_escape = vorrq_u8(too_low, too_high); + uint8x16_t has_escaped_char = vorrq_u8(has_forward_slash, vorrq_u8(has_backslash, has_dblquote)); + needs_escape = vorrq_u8(needs_escape, has_escaped_char); + + if (vmaxvq_u8(needs_escape) == 0) { + search->ptr += 16; + continue; + } + + for(int i=0; i<16; i++) { + unsigned char ch = *(search->ptr+i); + search->maybe_matches[i] = escape_table[ch]; + } + + return 1; + } + } else { + /* + * The code below implements an SIMD-based algorithm to determine if N bytes at a time + * need to be escaped. + * + * Assume the ptr = "Te\sting!" (the double quotes are included in the string) + * + * The explanination will be limited to the first 8 bytes of the string for simplicity. However + * the vector insructions may work on larger vectors. + * + * First, we load three constants 'lower_bound', 'backslash' and 'dblquote" in vector registers. + * + * lower_bound: [20 20 20 20 20 20 20 20] + * backslash: [5C 5C 5C 5C 5C 5C 5C 5C] + * dblquote: [22 22 22 22 22 22 22 22] + * + * Next we load the first chunk of the ptr: + * [22 54 65 5C 73 74 69 6E] (" T e \ s t i n) + * + * First we check if any byte in chunk is less than 32 (0x20). This returns the following vector + * as no bytes are less than 32 (0x20): + * [0 0 0 0 0 0 0 0] + * + * Next, we check if any byte in chunk is equal to a backslash: + * [0 0 0 FF 0 0 0 0] + * + * Finally we check if any byte in chunk is equal to a double quote: + * [FF 0 0 0 0 0 0 0] + * + * Now we have three vectors where each byte indicates if the corresponding byte in chunk + * needs to be escaped. We combine these vectors with a series of logical OR instructions. + * This is the needs_escape vector and it is equal to: + * [FF 0 0 FF 0 0 0 0] + * + * For ARM Neon specifically, we check if the maximum number in the vector is 0. The maximum of + * the needs_escape vector is FF. Therefore, we know there is at least one byte that needs to be + * escaped. + * + * If the maximum of the needs_escape vector is 0, none of the bytes need to be escaped and + * we advance pos by the width of the vector. + * + * To determine how to escape characters, we look at each value in the needs_escape vector and take + * the appropriate action. + */ + while (search->ptr+16 < search->end) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); + + if (vmaxvq_u8(needs_escape) == 0) { + search->ptr += 16; + continue; + } + + for(int i=0; i<16; i++) { + unsigned char ch = *(search->ptr+i); + search->maybe_matches[i] = escape_table[ch]; + } + + return 1; + } + } + return 0; +} + +// TODO This can likely be made generic if we know the stride width of the vector. +static inline unsigned char search_return_next_match_neon(search_state *search) { + for(; search->current_match_index < 16 && search->ptr < search->end; ) { + unsigned char ch_len = search->maybe_matches[search->current_match_index]; + + if (RB_UNLIKELY(ch_len)) { + if (ch_len & ESCAPE_MASK) { + if (RB_UNLIKELY(ch_len == 11)) { + const unsigned char *uptr = (const unsigned char *)search->ptr; + if (!(uptr[1] == 0x80 && (uptr[2] >> 1) == 0x54)) { + search->ptr += 3; + search->current_match_index += 3; + continue; + } + } + search->returned_from = search->ptr; + search_flush(search); + return ch_len & CHAR_LENGTH_MASK; + } else { + search->ptr += ch_len; + search->current_match_index += ch_len; + } + } else { + search->ptr++; + search->current_match_index++; + } + } + return 0; +} + +// TODO This can likely be made generic if we know the stride width of the vector and make the SIMD kernel a function pointer and which lookup tables to use. +static inline unsigned char search_escape_neon(search_state *search, const unsigned char escape_table[256]) +{ + if (RB_UNLIKELY(search->returned_from != NULL)) { + search->current_match_index += (search->ptr - search->returned_from); + search->returned_from = NULL; + unsigned char ch_len = search_return_next_match_neon(search); + if (RB_UNLIKELY(ch_len)) { + return ch_len; + } + } + + uint8x16x4_t *tables; + if (escape_table == script_safe_escape_table) { + tables = simd_state.neon.script_safe_escape_table; + } else { + tables = simd_state.neon.escape_table; + } + + while (search->ptr + 16 < search->end) { + if (!search_update_matches_neon_lut(search, tables)) { + break; + } + + // if (!search_update_matches_neon_rules(search, escape_table)) { + // break; + // } + + search->current_match_index=0; + unsigned char ch_len = search_return_next_match_neon(search); + if (RB_UNLIKELY(ch_len)) { + return ch_len; + } + } + + if (search->ptr < search->end) { + return search_escape(search, escape_table); + } + + search_flush(search); + return 0; +} +#endif /* HAVE_SIMD_NEON */ +#endif /* ENABLE_SIMD */ + static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_len) { const unsigned char ch = (unsigned char)*search->ptr; switch (ch_len) { @@ -263,7 +508,7 @@ static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_ static inline void convert_UTF8_to_JSON(search_state *search, const unsigned char escape_table[256]) { unsigned char ch_len; - while ((ch_len = search_escape(search, escape_table))) { + while ((ch_len = search_escape_impl(search, escape_table))) { fast_escape_UTF8_char(search, ch_len); } } @@ -929,6 +1174,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat search.cursor = search.ptr; search.end = search.ptr + len; +#ifdef ENABLE_SIMD + search.current_match_index = 0; + search.returned_from = NULL; +#endif /* ENABLE_SIMD */ + switch(rb_enc_str_coderange(obj)) { case ENC_CODERANGE_7BIT: case ENC_CODERANGE_VALID: @@ -1088,6 +1338,25 @@ static VALUE generate_json_rescue(VALUE d, VALUE exc) return Qundef; } +/* SIMD Utilities (if enabled) */ +#ifdef ENABLE_SIMD + +#ifdef HAVE_SIMD_NEON +static void initialize_simd_neon(void) { + simd_state.neon.escape_table[0] = load_uint8x16_4(escape_table, 0); + simd_state.neon.escape_table[1] = load_uint8x16_4(escape_table, 64); + simd_state.neon.escape_table[2] = load_uint8x16_4(escape_table, 128); + simd_state.neon.escape_table[3] = load_uint8x16_4(escape_table, 192); + + simd_state.neon.script_safe_escape_table[0] = load_uint8x16_4(script_safe_escape_table, 0); + simd_state.neon.script_safe_escape_table[1] = load_uint8x16_4(script_safe_escape_table, 64); + simd_state.neon.script_safe_escape_table[2] = load_uint8x16_4(script_safe_escape_table, 128); + simd_state.neon.script_safe_escape_table[3] = load_uint8x16_4(script_safe_escape_table, 192); +} +#endif /* HAVE_NEON_SIMD */ + +#endif + static VALUE cState_partial_generate(VALUE self, VALUE obj, generator_func func, VALUE io) { GET_STATE(self); @@ -1744,4 +2013,20 @@ void Init_generator(void) binary_encindex = rb_ascii8bit_encindex(); rb_require("json/ext/generator/state"); + + + switch(find_simd_implementation()) { +#ifdef ENABLE_SIMD +#ifdef HAVE_SIMD_NEON + case SIMD_NEON: + /* Initialize ARM Neon SIMD Implementation. */ + initialize_simd_neon(); + search_escape_impl = search_escape_neon; + break; +#endif /* HAVE_SIMD_NEON */ +#endif /* ENABLE_SIMD */ + default: + search_escape_impl = search_escape; + break; + } } diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h new file mode 100644 index 000000000..ba6e57b4b --- /dev/null +++ b/ext/json/ext/generator/simd.h @@ -0,0 +1,48 @@ +#include "extconf.h" + +typedef enum { + SIMD_NONE, + SIMD_NEON, +} SIMD_Implementation; + +#ifdef ENABLE_SIMD + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) +#include + +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 +SIMD_Implementation find_simd_implementation() { + return SIMD_NEON; +} + +#define HAVE_SIMD_NEON 1 + +uint8x16x4_t load_uint8x16_4(const unsigned char *table, int offset) { + uint8x16x4_t tab; + for(int i=0; i<4; i++) { + tab.val[i] = vld1q_u8(table+offset+(i*16)); + } + return tab; +} + +void print_uint8x16(char *msg, uint8x16_t vec) { + printf("%s\n[ ", msg); + uint8_t store[16] = {0}; + vst1q_u8(store, vec); + for(int i=0; i<16; i++) { + printf("%3d ", store[i]); + } + printf("]\n"); +} + +#endif /* ARM Neon Support.*/ + +/* Other SIMD implementation checks here. */ + +#endif /* ENABLE_SIMD */ + +#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED +SIMD_Implementation find_simd_implementation(void) { + return SIMD_NONE; +} +#endif \ No newline at end of file diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index d97f0505f..f4621fa2b 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -427,18 +427,34 @@ def test_backslash json = '["\\\\.(?i:gif|jpe?g|png)$"]' assert_equal json, generate(data) # - data = [ '\\"' ] - json = '["\\\\\""]' + data = [ '\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$' ] + json = '["\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$"]' + assert_equal json, generate(data) + # + data = [ '\\"\\"\\"\\"\\"\\"\\"\\"\\"\\"\\"' ] + json = '["\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\""]' assert_equal json, generate(data) # data = [ '/' ] json = '["/"]' assert_equal json, generate(data) # + data = [ '////////////////////////////////////////////////////////////////////////////////////' ] + json = '["////////////////////////////////////////////////////////////////////////////////////"]' + assert_equal json, generate(data) + # data = [ '/' ] json = '["\/"]' assert_equal json, generate(data, :script_safe => true) # + data = [ '///////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # + data = [ '///////////////////////////////////////////////////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # data = [ "\u2028\u2029" ] json = '["\u2028\u2029"]' assert_equal json, generate(data, :script_safe => true) @@ -455,6 +471,10 @@ def test_backslash json = '["\""]' assert_equal json, generate(data) # + data = ['"""""""""""""""""""""""""'] + json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]' + assert_equal json, generate(data) + # data = ["'"] json = '["\\\'"]' assert_equal '["\'"]', generate(data) @@ -462,6 +482,30 @@ def test_backslash data = ["倩", "瀨"] json = '["倩","瀨"]' assert_equal json, generate(data, script_safe: true) + # + data = '["This is a "test" of the emergency broadcast system."]' + json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\"" + assert_equal json, generate(data) + # + data = '\tThis is a test of the emergency broadcast system.' + json = "\"\\\\tThis is a test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This\tis a test of the emergency broadcast system.' + json = "\"This\\\\tis a test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This is\ta test of the emergency broadcast system.' + json = "\"This is\\\\ta test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This is a test of the emergency broadcast\tsystem.' + json = "\"This is a test of the emergency broadcast\\\\tsystem.\"" + assert_equal json, generate(data) + # + data = 'This is a test of the emergency broadcast\tsystem.\n' + json = "\"This is a test of the emergency broadcast\\\\tsystem.\\\\n\"" + assert_equal json, generate(data) end def test_string_subclass From 9ad196e996249fe4fb04e625059e54251646e64d Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Tue, 4 Feb 2025 20:23:08 -0600 Subject: [PATCH 02/51] Use the 'rules' implementation instead of the lookup table implementation. Also store the potential matches directly rather than looking up values in the escape table. --- ext/json/ext/generator/generator.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index f8744666c..a6a2e5dfd 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -303,7 +303,7 @@ static unsigned char search_update_matches_neon_rules(search_state *search, cons search->ptr += 16; continue; } - + for(int i=0; i<16; i++) { unsigned char ch = *(search->ptr+i); search->maybe_matches[i] = escape_table[ch]; @@ -367,10 +367,8 @@ static unsigned char search_update_matches_neon_rules(search_state *search, cons continue; } - for(int i=0; i<16; i++) { - unsigned char ch = *(search->ptr+i); - search->maybe_matches[i] = escape_table[ch]; - } + uint8x16_t maybe_matches = vandq_u8(needs_escape, vdupq_n_u8(0x9)); + vst1q_u8(search->maybe_matches, maybe_matches); return 1; } @@ -428,14 +426,14 @@ static inline unsigned char search_escape_neon(search_state *search, const unsig } while (search->ptr + 16 < search->end) { - if (!search_update_matches_neon_lut(search, tables)) { - break; - } - - // if (!search_update_matches_neon_rules(search, escape_table)) { + // if (!search_update_matches_neon_lut(search, tables)) { // break; // } + if (!search_update_matches_neon_rules(search, escape_table)) { + break; + } + search->current_match_index=0; unsigned char ch_len = search_return_next_match_neon(search); if (RB_UNLIKELY(ch_len)) { From d8a2e56f391c17e7a5f20b5aa34bcb599acc6243 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Tue, 4 Feb 2025 21:11:26 -0600 Subject: [PATCH 03/51] Refactoring and simplifications. --- ext/json/ext/generator/generator.c | 100 ++++++++++++++--------------- 1 file changed, 47 insertions(+), 53 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 75222a9fa..de9619397 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -253,12 +253,42 @@ static struct _simd_state simd_state; #ifdef ENABLE_SIMD #ifdef HAVE_SIMD_NEON -static inline unsigned char search_update_matches_neon_lut(search_state *search, uint8x16x4_t *tables) { +// TODO This can likely be made generic if we know the stride width of the vector. +static inline unsigned char search_escape_basic_neon_next_match(search_state *search) { + for(; search->current_match_index < 16 && search->ptr < search->end; ) { + unsigned char ch_len = search->maybe_matches[search->current_match_index]; + + if (RB_UNLIKELY(ch_len)) { + if (ch_len & ESCAPE_MASK) { + if (RB_UNLIKELY(ch_len == 11)) { + const unsigned char *uptr = (const unsigned char *)search->ptr; + if (!(uptr[1] == 0x80 && (uptr[2] >> 1) == 0x54)) { + search->ptr += 3; + search->current_match_index += 3; + continue; + } + } + search->returned_from = search->ptr; + search_flush(search); + return ch_len & CHAR_LENGTH_MASK; + } else { + search->ptr += ch_len; + search->current_match_index += ch_len; + } + } else { + search->ptr++; + search->current_match_index++; + } + } + return 0; +} + +static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { while (search->ptr + 16 < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); - uint8x16_t tmp1 = vqtbl4q_u8(tables[0], chunk); - uint8x16_t tmp2 = vqtbl4q_u8(tables[1], veorq_u8(chunk, vdupq_n_u8(0x40))); + uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table[0], chunk); + uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table[1], veorq_u8(chunk, vdupq_n_u8(0x40))); uint8x16_t result = vorrq_u8(tmp1, tmp2); @@ -268,13 +298,15 @@ static inline unsigned char search_update_matches_neon_lut(search_state *search, } vst1q_u8(search->maybe_matches, result); - return 1; + + search->current_match_index=0; + return search_escape_basic_neon_next_match(search); } return 0; } -static unsigned char search_update_matches_neon_rules(search_state *search) { +static unsigned char search_escape_basic_neon_advance_rules(search_state *search) { const uint8x16_t lower_bound = vdupq_n_u8(' '); const uint8x16_t backslash = vdupq_n_u8('\\'); const uint8x16_t dblquote = vdupq_n_u8('\"'); @@ -337,72 +369,34 @@ static unsigned char search_update_matches_neon_rules(search_state *search) { uint8x16_t maybe_matches = vandq_u8(needs_escape, vdupq_n_u8(0x9)); vst1q_u8(search->maybe_matches, maybe_matches); - return 1; + search->current_match_index=0; + return search_escape_basic_neon_next_match(search); } return 0; } -// TODO This can likely be made generic if we know the stride width of the vector. -static inline unsigned char search_return_next_match_neon(search_state *search) { - for(; search->current_match_index < 16 && search->ptr < search->end; ) { - unsigned char ch_len = search->maybe_matches[search->current_match_index]; - - if (RB_UNLIKELY(ch_len)) { - if (ch_len & ESCAPE_MASK) { - if (RB_UNLIKELY(ch_len == 11)) { - const unsigned char *uptr = (const unsigned char *)search->ptr; - if (!(uptr[1] == 0x80 && (uptr[2] >> 1) == 0x54)) { - search->ptr += 3; - search->current_match_index += 3; - continue; - } - } - search->returned_from = search->ptr; - search_flush(search); - return ch_len & CHAR_LENGTH_MASK; - } else { - search->ptr += ch_len; - search->current_match_index += ch_len; - } - } else { - search->ptr++; - search->current_match_index++; - } - } - return 0; -} - // TODO This can likely be made generic if we know the stride width of the vector and make the SIMD kernel a function pointer and which lookup tables to use. static inline unsigned char search_escape_basic_neon(search_state *search) { if (RB_UNLIKELY(search->returned_from != NULL)) { search->current_match_index += (search->ptr - search->returned_from); search->returned_from = NULL; - unsigned char ch_len = search_return_next_match_neon(search); + unsigned char ch_len = search_escape_basic_neon_next_match(search); if (RB_UNLIKELY(ch_len)) { return ch_len; } } - // uint8x16x4_t *tables = simd_state.neon.escape_table; - - while (search->ptr + 16 < search->end) { - // if (!search_update_matches_neon_lut(search, tables)) { - // break; - // } - - if (!search_update_matches_neon_rules(search)) { - break; - } - - search->current_match_index=0; - unsigned char ch_len = search_return_next_match_neon(search); - if (RB_UNLIKELY(ch_len)) { - return ch_len; - } + unsigned char ch_len; + if ((ch_len = search_escape_basic_neon_advance_lut(search)) != 0) { + return ch_len; } + // if ((ch_len = search_escape_basic_neon_advance_rules(search)) != 0) { + // return ch_len; + // } + if (search->ptr < search->end) { return search_escape_basic(search); } From 89ba0be1038fe3c0f8c20c0f0c53e0a597db31c7 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 5 Feb 2025 20:46:43 -0600 Subject: [PATCH 04/51] Load the SIMD lookup table explicitly without loops. --- ext/json/ext/generator/generator.c | 8 ++++---- ext/json/ext/generator/simd.h | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index de9619397..a33fa0335 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -1325,10 +1325,10 @@ static VALUE generate_json_rescue(VALUE d, VALUE exc) #ifdef HAVE_SIMD_NEON static void initialize_simd_neon(void) { - simd_state.neon.escape_table[0] = load_uint8x16_4(escape_table_basic, 0); - simd_state.neon.escape_table[1] = load_uint8x16_4(escape_table_basic, 64); - simd_state.neon.escape_table[2] = load_uint8x16_4(escape_table_basic, 128); - simd_state.neon.escape_table[3] = load_uint8x16_4(escape_table_basic, 192); + simd_state.neon.escape_table[0] = load_uint8x16_4(escape_table_basic); + simd_state.neon.escape_table[1] = load_uint8x16_4(escape_table_basic+64); + simd_state.neon.escape_table[2] = load_uint8x16_4(escape_table_basic+128); + simd_state.neon.escape_table[3] = load_uint8x16_4(escape_table_basic+192); } #endif /* HAVE_NEON_SIMD */ diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index ba6e57b4b..11332ee15 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -17,11 +17,12 @@ SIMD_Implementation find_simd_implementation() { #define HAVE_SIMD_NEON 1 -uint8x16x4_t load_uint8x16_4(const unsigned char *table, int offset) { +uint8x16x4_t load_uint8x16_4(const unsigned char *table) { uint8x16x4_t tab; - for(int i=0; i<4; i++) { - tab.val[i] = vld1q_u8(table+offset+(i*16)); - } + tab.val[0] = vld1q_u8(table); + tab.val[1] = vld1q_u8(table+16); + tab.val[2] = vld1q_u8(table+32); + tab.val[3] = vld1q_u8(table+48); return tab; } From a23b84e1da0277ca5cb6819853cf37ea66a5cc8f Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 5 Feb 2025 21:05:20 -0600 Subject: [PATCH 05/51] Use only 2 64-byte lookup tables for the neon escape_table_basic as we only need 128 bytes for the lookup table as the top 128 bytes are all zeros. --- ext/json/ext/generator/generator.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index a33fa0335..0525975b4 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -241,7 +241,7 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) struct _simd_state { #ifdef HAVE_SIMD_NEON struct { - uint8x16x4_t escape_table[4]; + uint8x16x4_t escape_table_basic[2]; } neon; #endif /* HAVE_SIMD_NEON */ }; @@ -287,8 +287,8 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s while (search->ptr + 16 < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); - uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table[0], chunk); - uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table[1], veorq_u8(chunk, vdupq_n_u8(0x40))); + uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); + uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); uint8x16_t result = vorrq_u8(tmp1, tmp2); @@ -1325,10 +1325,8 @@ static VALUE generate_json_rescue(VALUE d, VALUE exc) #ifdef HAVE_SIMD_NEON static void initialize_simd_neon(void) { - simd_state.neon.escape_table[0] = load_uint8x16_4(escape_table_basic); - simd_state.neon.escape_table[1] = load_uint8x16_4(escape_table_basic+64); - simd_state.neon.escape_table[2] = load_uint8x16_4(escape_table_basic+128); - simd_state.neon.escape_table[3] = load_uint8x16_4(escape_table_basic+192); + simd_state.neon.escape_table_basic[0] = load_uint8x16_4(escape_table_basic); + simd_state.neon.escape_table_basic[1] = load_uint8x16_4(escape_table_basic+64); } #endif /* HAVE_NEON_SIMD */ From 5506091c7c961eebfa08879af1bc23c28061b213 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 9 Feb 2025 20:45:03 -0600 Subject: [PATCH 06/51] Simplifications. --- ext/json/ext/generator/generator.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 0525975b4..d70654c2b 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -259,22 +259,9 @@ static inline unsigned char search_escape_basic_neon_next_match(search_state *se unsigned char ch_len = search->maybe_matches[search->current_match_index]; if (RB_UNLIKELY(ch_len)) { - if (ch_len & ESCAPE_MASK) { - if (RB_UNLIKELY(ch_len == 11)) { - const unsigned char *uptr = (const unsigned char *)search->ptr; - if (!(uptr[1] == 0x80 && (uptr[2] >> 1) == 0x54)) { - search->ptr += 3; - search->current_match_index += 3; - continue; - } - } - search->returned_from = search->ptr; - search_flush(search); - return ch_len & CHAR_LENGTH_MASK; - } else { - search->ptr += ch_len; - search->current_match_index += ch_len; - } + search->returned_from = search->ptr; + search_flush(search); + return 1; } else { search->ptr++; search->current_match_index++; From 3ae56773b3068ee1709aab08071aba1a05bda72d Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 9 Feb 2025 21:08:33 -0600 Subject: [PATCH 07/51] A few more cleanups. --- ext/json/ext/generator/generator.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index d70654c2b..a91b2f69d 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -369,19 +369,17 @@ static inline unsigned char search_escape_basic_neon(search_state *search) if (RB_UNLIKELY(search->returned_from != NULL)) { search->current_match_index += (search->ptr - search->returned_from); search->returned_from = NULL; - unsigned char ch_len = search_escape_basic_neon_next_match(search); - if (RB_UNLIKELY(ch_len)) { - return ch_len; + if (RB_UNLIKELY(search_escape_basic_neon_next_match(search))) { + return 1; } } - unsigned char ch_len; - if ((ch_len = search_escape_basic_neon_advance_lut(search)) != 0) { - return ch_len; + if (search_escape_basic_neon_advance_lut(search)) { + return 1; } - // if ((ch_len = search_escape_basic_neon_advance_rules(search)) != 0) { - // return ch_len; + // if (search_escape_basic_neon_advance_rules(search)) { + // return 1; // } if (search->ptr < search->end) { From 332107dbf40634faa0bc5aeeda3375f3f17dbd97 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 23 Mar 2025 21:38:59 -0400 Subject: [PATCH 08/51] Use SIMD for fewer than 16 characters (but at least 8) remaining. --- ext/json/ext/generator/generator.c | 108 ++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 17 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index a91b2f69d..6b0488667 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -120,8 +120,10 @@ typedef struct _search_state { static inline void search_flush(search_state *search) { - fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); - search->cursor = search->ptr; + if (search->cursor < search->ptr) { + fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); + search->cursor = search->ptr; + } } static const unsigned char escape_table_basic[256] = { @@ -270,14 +272,19 @@ static inline unsigned char search_escape_basic_neon_next_match(search_state *se return 0; } -static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { - while (search->ptr + 16 < search->end) { - uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); +static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { + uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); + uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); - uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); - uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); + uint8x16_t result = vorrq_u8(tmp1, tmp2); + return result; +} - uint8x16_t result = vorrq_u8(tmp1, tmp2); + +static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { + while (search->ptr + 16 < search->end) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + uint8x16_t result = neon_lut_update(chunk); if (vmaxvq_u8(result) == 0) { search->ptr += 16; @@ -290,14 +297,52 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s return search_escape_basic_neon_next_match(search); } + // There are fewer than 16 bytes left. + unsigned long remaining = (search->end - search->ptr); + if (remaining >= 8) { + // Flush the buffer so everything up until the last 'remaining' characters are unflushed. + search_flush(search); + + FBuffer *buf = search->buffer; + fbuffer_inc_capa(buf, 16); + + char *s = (buf->ptr + buf->len); + + memset(s, 'X', 16); + + // Optimistically copy the remaining characters to the output FBuffer. If there are no characters + // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. + memcpy(s, search->ptr, remaining); + + uint8x16_t chunk = vld1q_u8((const unsigned char *) s); + uint8x16_t result = neon_lut_update(chunk); + if (vmaxvq_u8(result) == 0) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. + buf->len += remaining; + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + } + return 0; } -static unsigned char search_escape_basic_neon_advance_rules(search_state *search) { +static inline uint8x16_t neon_rules_update(uint8x16_t chunk) { const uint8x16_t lower_bound = vdupq_n_u8(' '); const uint8x16_t backslash = vdupq_n_u8('\\'); const uint8x16_t dblquote = vdupq_n_u8('\"'); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); + + return needs_escape; +} + +static unsigned char search_escape_basic_neon_advance_rules(search_state *search) { /* * The code below implements an SIMD-based algorithm to determine if N bytes at a time * need to be escaped. @@ -343,10 +388,7 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search */ while (search->ptr+16 < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); - uint8x16_t too_low = vcltq_u8(chunk, lower_bound); - uint8x16_t has_backslash = vceqq_u8(chunk, backslash); - uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); - uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); + uint8x16_t needs_escape = neon_rules_update(chunk); if (vmaxvq_u8(needs_escape) == 0) { search->ptr += 16; @@ -360,6 +402,35 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search return search_escape_basic_neon_next_match(search); } + // There are fewer than 16 bytes left. + unsigned long remaining = (search->end - search->ptr); + if (remaining >= 8) { + // Flush the buffer so everything up until the last 'remaining' characters are unflushed. + search_flush(search); + + FBuffer *buf = search->buffer; + fbuffer_inc_capa(buf, 16); + + char *s = (buf->ptr + buf->len); + + memset(s, 'X', 16); + + // Optimistically copy the remaining characters to the output FBuffer. If there are no characters + // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. + memcpy(s, search->ptr, remaining); + + uint8x16_t chunk = vld1q_u8((const unsigned char *) s); + uint8x16_t result = neon_rules_update(chunk); + if (vmaxvq_u8(result) == 0) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. + buf->len += remaining; + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + } + return 0; } @@ -374,14 +445,17 @@ static inline unsigned char search_escape_basic_neon(search_state *search) } } - if (search_escape_basic_neon_advance_lut(search)) { - return 1; - } + // TODO Pick an implementation or make them configurable. Right now it looks like the "rules" based approach + // might be a bit faster. - // if (search_escape_basic_neon_advance_rules(search)) { + // if (search_escape_basic_neon_advance_lut(search)) { // return 1; // } + if (search_escape_basic_neon_advance_rules(search)) { + return 1; + } + if (search->ptr < search->end) { return search_escape_basic(search); } From a47ffa02cf53d9b2b08b60a381d1cfda24ad13fa Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Fri, 4 Apr 2025 22:05:33 -0500 Subject: [PATCH 09/51] Add x86-64 SSE2 support with runtime detection. --- ext/json/ext/generator/extconf.rb | 12 +++ ext/json/ext/generator/generator.c | 135 ++++++++++++++++++++++++++--- ext/json/ext/generator/simd.h | 44 +++++++++- 3 files changed, 178 insertions(+), 13 deletions(-) diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 109a73a99..65f87434b 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -21,6 +21,18 @@ $defs.push("-DENABLE_SIMD") end end + + if have_type('__m128i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-msse2') + #include + int main() { + __m128i test = _mm_set1_epi8(32); + return 0; + } + SRC + $defs.push("-DENABLE_SIMD") + end + + have_header('cpuid.h') end create_header diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 6b0488667..d2a69c23a 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -115,6 +115,7 @@ typedef struct _search_state { const char *returned_from; unsigned char maybe_matches[16]; unsigned long current_match_index; + unsigned long maybe_match_length; #endif /* ENABLE_SIMD */ } search_state; @@ -240,24 +241,22 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) #ifdef ENABLE_SIMD -struct _simd_state { #ifdef HAVE_SIMD_NEON +struct _simd_state { + struct { uint8x16x4_t escape_table_basic[2]; } neon; -#endif /* HAVE_SIMD_NEON */ }; static struct _simd_state simd_state; - +#endif /* HAVE_SIMD_NEON */ #endif /* ENABLE_SIMD */ #ifdef ENABLE_SIMD -#ifdef HAVE_SIMD_NEON - // TODO This can likely be made generic if we know the stride width of the vector. -static inline unsigned char search_escape_basic_neon_next_match(search_state *search) { - for(; search->current_match_index < 16 && search->ptr < search->end; ) { +static inline unsigned char search_escape_basic_simd_next_match(search_state *search) { + for(; search->current_match_index < search->maybe_match_length && search->ptr < search->end; ) { unsigned char ch_len = search->maybe_matches[search->current_match_index]; if (RB_UNLIKELY(ch_len)) { @@ -272,6 +271,8 @@ static inline unsigned char search_escape_basic_neon_next_match(search_state *se return 0; } +#ifdef HAVE_SIMD_NEON + static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); @@ -293,8 +294,9 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s vst1q_u8(search->maybe_matches, result); - search->current_match_index=0; - return search_escape_basic_neon_next_match(search); + search->current_match_index = 0; + search->maybe_match_length = sizeof(uint8x16_t); + return search_escape_basic_simd_next_match(search); } // There are fewer than 16 bytes left. @@ -398,8 +400,9 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search uint8x16_t maybe_matches = vandq_u8(needs_escape, vdupq_n_u8(0x9)); vst1q_u8(search->maybe_matches, maybe_matches); - search->current_match_index=0; - return search_escape_basic_neon_next_match(search); + search->current_match_index = 0; + search->maybe_match_length = sizeof(uint8x16_t); + return search_escape_basic_simd_next_match(search); } // There are fewer than 16 bytes left. @@ -440,7 +443,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) if (RB_UNLIKELY(search->returned_from != NULL)) { search->current_match_index += (search->ptr - search->returned_from); search->returned_from = NULL; - if (RB_UNLIKELY(search_escape_basic_neon_next_match(search))) { + if (RB_UNLIKELY(search_escape_basic_simd_next_match(search))) { return 1; } } @@ -464,6 +467,109 @@ static inline unsigned char search_escape_basic_neon(search_state *search) return 0; } #endif /* HAVE_SIMD_NEON */ + +#ifdef HAVE_SIMD_SSE2 + +#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) +#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) +#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) +#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) + +#ifdef __GNUC__ +#pragma GCC push_options +#pragma GCC target ("sse2") +#endif /* __GNUC__ */ + +#ifdef __clang__ +__attribute__((target("sse2"))) +#endif /* __clang__ */ +static unsigned char search_escape_basic_sse2(search_state *search) { + if (RB_UNLIKELY(search->returned_from != NULL)) { + search->current_match_index += (search->ptr - search->returned_from); + search->returned_from = NULL; + if (RB_UNLIKELY(search_escape_basic_simd_next_match(search))) { + return 1; + } + } + + const __m128i lower_bound = _mm_set1_epi8(' '); + const __m128i backslash = _mm_set1_epi8('\\'); + const __m128i dblquote = _mm_set1_epi8('\"'); + + while (search->ptr+sizeof(__m128i) < search->end) { + __m128i chunk = _mm_loadu_si128((__m128i const*)search->ptr); + __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + + int needs_escape_mask = _mm_movemask_epi8(needs_escape); + + if (needs_escape_mask == 0) { + search->ptr += sizeof(__m128i); + continue; + } + + __m128i nines = _mm_set1_epi8(9); + __m128i maybe_matches = _mm_and_si128(needs_escape, nines); + + _mm_storeu_si128((__m128i *)search->maybe_matches, maybe_matches); + + search->current_match_index = 0; + search->maybe_match_length = sizeof(__m128i); + return search_escape_basic_simd_next_match(search); + } + + + // There are fewer than 16 bytes left. + unsigned long remaining = (search->end - search->ptr); + if (remaining >= 8) { + // Flush the buffer so everything up until the last 'remaining' characters are unflushed. + search_flush(search); + + FBuffer *buf = search->buffer; + fbuffer_inc_capa(buf, 16); + + char *s = (buf->ptr + buf->len); + + memset(s, 'X', 16); + + // Optimistically copy the remaining characters to the output FBuffer. If there are no characters + // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. + memcpy(s, search->ptr, remaining); + + __m128i chunk = _mm_loadu_si128((__m128i const *) s); + __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + + int needs_escape_mask = _mm_movemask_epi8(needs_escape); + + if (needs_escape_mask == 0) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. + buf->len += remaining; + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + } + + if (search->ptr < search->end) { + return search_escape_basic(search); + } + + search_flush(search); + return 0; +} + +#ifdef __GNUC__ +#pragma GCC reset_options +#endif /* __GNUC__ */ + +#endif /* HAVE_SIMD_SSE2 */ + #endif /* ENABLE_SIMD */ static const unsigned char script_safe_escape_table[256] = { @@ -2058,6 +2164,11 @@ void Init_generator(void) search_escape_basic_impl = search_escape_basic_neon; break; #endif /* HAVE_SIMD_NEON */ +#ifdef HAVE_SIMD_SSE2 + case SIMD_SSE2: + search_escape_basic_impl = search_escape_basic_sse2; + break; +#endif /* HAVE_SIMD_SSE2 */ #endif /* ENABLE_SIMD */ default: search_escape_basic_impl = search_escape_basic; diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index 11332ee15..73100723d 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -3,6 +3,7 @@ typedef enum { SIMD_NONE, SIMD_NEON, + SIMD_SSE2 } SIMD_Implementation; #ifdef ENABLE_SIMD @@ -38,7 +39,48 @@ void print_uint8x16(char *msg, uint8x16_t vec) { #endif /* ARM Neon Support.*/ -/* Other SIMD implementation checks here. */ +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) + +#ifdef HAVE_X86INTRIN_H +#include + +#define HAVE_SIMD_SSE2 1 + +void print_m128i(const char *prefix, __m128i vec) { + uint8_t r[16]; + _mm_storeu_si128((__m128i *) r, vec); + + printf("%s = [ ", prefix); + for(int i=0; i<16; i++) { + printf("%02x ", r[i]); + } + printf("]\n"); +} + +#ifdef HAVE_CPUID_H +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 + +#include +#endif /* HAVE_CPUID_H */ + +SIMD_Implementation find_simd_implementation(void) { + +#if defined(__GNUC__ ) || defined(__clang__) +#ifdef __GNUC__ + __builtin_cpu_init(); +#endif /* __GNUC__ */ + + // TODO Revisit. I think the SSE version now only uses SSE2 instructions. + if (__builtin_cpu_supports("sse2")) { + return SIMD_SSE2; + } +#endif /* __GNUC__ || __clang__*/ + + return SIMD_NONE; +} + +#endif /* HAVE_X86INTRIN_H */ +#endif /* X86_64 Support */ #endif /* ENABLE_SIMD */ From b2cab3380fac7cee92f37b7785490d4eab5b1ddb Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 5 Apr 2025 10:59:36 -0500 Subject: [PATCH 10/51] Simplified the SSE2 implementation. --- ext/json/ext/generator/generator.c | 53 ++++++++++++++++++------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index d2a69c23a..9a0473aaf 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -470,16 +470,40 @@ static inline unsigned char search_escape_basic_neon(search_state *search) #ifdef HAVE_SIMD_SSE2 -#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) -#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) -#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) -#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) +// #define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) +// #define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) +// #define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) +// #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) #ifdef __GNUC__ #pragma GCC push_options #pragma GCC target ("sse2") #endif /* __GNUC__ */ +#ifdef __clang__ +__attribute__((target("sse2"))) +#endif /* __clang__ */ +static inline __m128i sse2_update(__m128i chunk) { + const __m128i lower_bound = _mm_set1_epi8(' '); + const __m128i backslash = _mm_set1_epi8('\\'); + const __m128i dblquote = _mm_set1_epi8('\"'); + const __m128i high_bit = _mm_set1_epi8(0x80); + + // __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + + // This is a signed comparison. We need special handling for bytes > 127. + __m128i too_low = _mm_cmplt_epi8(chunk, lower_bound); + + // Determine which bytes have the high bit set and remove them from 'too_low'. + __m128i high_bit_set = _mm_cmpeq_epi8(_mm_and_si128(chunk, high_bit), high_bit); + too_low = _mm_andnot_si128(high_bit_set, too_low); + + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + return needs_escape; +} + #ifdef __clang__ __attribute__((target("sse2"))) #endif /* __clang__ */ @@ -492,16 +516,9 @@ static unsigned char search_escape_basic_sse2(search_state *search) { } } - const __m128i lower_bound = _mm_set1_epi8(' '); - const __m128i backslash = _mm_set1_epi8('\\'); - const __m128i dblquote = _mm_set1_epi8('\"'); - while (search->ptr+sizeof(__m128i) < search->end) { __m128i chunk = _mm_loadu_si128((__m128i const*)search->ptr); - __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); - __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); - __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); - __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + __m128i needs_escape = sse2_update(chunk); int needs_escape_mask = _mm_movemask_epi8(needs_escape); @@ -510,17 +527,14 @@ static unsigned char search_escape_basic_sse2(search_state *search) { continue; } - __m128i nines = _mm_set1_epi8(9); - __m128i maybe_matches = _mm_and_si128(needs_escape, nines); - - _mm_storeu_si128((__m128i *)search->maybe_matches, maybe_matches); + // It doesn't matter what the value of each byte in 'maybe_matches' as long as a match is non-zero. + _mm_storeu_si128((__m128i *)search->maybe_matches, needs_escape); search->current_match_index = 0; search->maybe_match_length = sizeof(__m128i); return search_escape_basic_simd_next_match(search); } - // There are fewer than 16 bytes left. unsigned long remaining = (search->end - search->ptr); if (remaining >= 8) { @@ -539,10 +553,7 @@ static unsigned char search_escape_basic_sse2(search_state *search) { memcpy(s, search->ptr, remaining); __m128i chunk = _mm_loadu_si128((__m128i const *) s); - __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); - __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); - __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); - __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + __m128i needs_escape = sse2_update(chunk); int needs_escape_mask = _mm_movemask_epi8(needs_escape); From 5cd7b5e8034e6dab1381f41c56326b600ecbc1bb Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 5 Apr 2025 11:09:05 -0500 Subject: [PATCH 11/51] A small simplification to the ARM Neon implementation. --- ext/json/ext/generator/generator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 9a0473aaf..c6fd527e4 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -397,8 +397,8 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search continue; } - uint8x16_t maybe_matches = vandq_u8(needs_escape, vdupq_n_u8(0x9)); - vst1q_u8(search->maybe_matches, maybe_matches); + // It doesn't matter what the value of each byte in 'maybe_matches' as long as a match is non-zero. + vst1q_u8(search->maybe_matches, needs_escape); search->current_match_index = 0; search->maybe_match_length = sizeof(uint8x16_t); From 1d00db9967c66b7730c4a3a87bbadd5a271d1690 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 5 Apr 2025 11:20:59 -0500 Subject: [PATCH 12/51] More cleanups. --- ext/json/ext/generator/generator.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index c6fd527e4..d40af4cc9 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -254,7 +254,7 @@ static struct _simd_state simd_state; #endif /* ENABLE_SIMD */ #ifdef ENABLE_SIMD -// TODO This can likely be made generic if we know the stride width of the vector. + static inline unsigned char search_escape_basic_simd_next_match(search_state *search) { for(; search->current_match_index < search->maybe_match_length && search->ptr < search->end; ) { unsigned char ch_len = search->maybe_matches[search->current_match_index]; @@ -283,12 +283,12 @@ static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { - while (search->ptr + 16 < search->end) { + while (search->ptr+sizeof(uint8x16_t) < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); uint8x16_t result = neon_lut_update(chunk); if (vmaxvq_u8(result) == 0) { - search->ptr += 16; + search->ptr += sizeof(uint8x16_t); continue; } @@ -306,11 +306,11 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s search_flush(search); FBuffer *buf = search->buffer; - fbuffer_inc_capa(buf, 16); + fbuffer_inc_capa(buf, sizeof(uint8x16_t)); char *s = (buf->ptr + buf->len); - memset(s, 'X', 16); + memset(s, 'X', sizeof(uint8x16_t)); // Optimistically copy the remaining characters to the output FBuffer. If there are no characters // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. @@ -388,16 +388,16 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search * To determine how to escape characters, we look at each value in the needs_escape vector and take * the appropriate action. */ - while (search->ptr+16 < search->end) { + while (search->ptr+sizeof(uint8x16_t) < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); uint8x16_t needs_escape = neon_rules_update(chunk); if (vmaxvq_u8(needs_escape) == 0) { - search->ptr += 16; + search->ptr += sizeof(uint8x16_t); continue; } - // It doesn't matter what the value of each byte in 'maybe_matches' as long as a match is non-zero. + // It doesn't matter the value of each byte in 'maybe_matches' as long as a match is non-zero. vst1q_u8(search->maybe_matches, needs_escape); search->current_match_index = 0; @@ -412,11 +412,11 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search search_flush(search); FBuffer *buf = search->buffer; - fbuffer_inc_capa(buf, 16); + fbuffer_inc_capa(buf, sizeof(uint8x16_t)); char *s = (buf->ptr + buf->len); - memset(s, 'X', 16); + memset(s, 'X', sizeof(uint8x16_t)); // Optimistically copy the remaining characters to the output FBuffer. If there are no characters // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. @@ -437,7 +437,6 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search return 0; } -// TODO This can likely be made generic if we know the stride width of the vector and make the SIMD kernel a function pointer and which lookup tables to use. static inline unsigned char search_escape_basic_neon(search_state *search) { if (RB_UNLIKELY(search->returned_from != NULL)) { @@ -527,7 +526,7 @@ static unsigned char search_escape_basic_sse2(search_state *search) { continue; } - // It doesn't matter what the value of each byte in 'maybe_matches' as long as a match is non-zero. + // It doesn't matter the value of each byte in 'maybe_matches' as long as a match is non-zero. _mm_storeu_si128((__m128i *)search->maybe_matches, needs_escape); search->current_match_index = 0; @@ -542,11 +541,11 @@ static unsigned char search_escape_basic_sse2(search_state *search) { search_flush(search); FBuffer *buf = search->buffer; - fbuffer_inc_capa(buf, 16); + fbuffer_inc_capa(buf, sizeof(__m128i)); char *s = (buf->ptr + buf->len); - memset(s, 'X', 16); + memset(s, 'X', sizeof(__m128i)); // Optimistically copy the remaining characters to the output FBuffer. If there are no characters // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. From 475925429b22a27d2f7adf2b62fc92fba5cb7f6b Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 6 Apr 2025 11:43:54 -0500 Subject: [PATCH 13/51] Neon: Use a mask to locate the characters that need to be escaped instead of iterating through the chunk one byte/result at a time. --- ext/json/ext/generator/generator.c | 74 ++++++++++++++++++++++-------- ext/json/ext/generator/simd.h | 26 +++++++++++ 2 files changed, 81 insertions(+), 19 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index d40af4cc9..972595353 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -114,6 +114,13 @@ typedef struct _search_state { #ifdef ENABLE_SIMD const char *returned_from; unsigned char maybe_matches[16]; + +#ifdef HAVE_SIMD_NEON + uint64_t matches_mask; + const char *chunk_base; + uint8_t has_matches; +#endif /* HAVE_SIMD_NEON */ + unsigned long current_match_index; unsigned long maybe_match_length; #endif /* ENABLE_SIMD */ @@ -273,15 +280,40 @@ static inline unsigned char search_escape_basic_simd_next_match(search_state *se #ifdef HAVE_SIMD_NEON +static inline unsigned char neon_mask_next_match(search_state *search) { + uint64_t mask = search->matches_mask; + if (mask > 0) { + uint32_t index = trailing_zeros(mask) >> 2; + + // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. + // If we want to use a similar approach for full escaping we'll need to ensure: + // search->chunk_base + index >= search->ptr + // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match + // is one byte after the previous match then: + // search->chunk_base + index == search->ptr + search->ptr = search->chunk_base + index; + mask &= mask - 1; + search->matches_mask = mask; + search_flush(search); + return 1; + } + return 0; +} + +// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon +static inline uint64_t neon_match_mask(uint8x16_t matches) { + const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4); + const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0); + return mask & 0x8888888888888888ull; +} + static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); - uint8x16_t result = vorrq_u8(tmp1, tmp2); return result; } - static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { while (search->ptr+sizeof(uint8x16_t) < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); @@ -292,11 +324,10 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s continue; } - vst1q_u8(search->maybe_matches, result); - - search->current_match_index = 0; - search->maybe_match_length = sizeof(uint8x16_t); - return search_escape_basic_simd_next_match(search); + search->matches_mask = neon_match_mask(vceqq_u8(result, vdupq_n_u8(9))); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_mask_next_match(search); } // There are fewer than 16 bytes left. @@ -396,13 +427,11 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search search->ptr += sizeof(uint8x16_t); continue; } - - // It doesn't matter the value of each byte in 'maybe_matches' as long as a match is non-zero. - vst1q_u8(search->maybe_matches, needs_escape); - search->current_match_index = 0; - search->maybe_match_length = sizeof(uint8x16_t); - return search_escape_basic_simd_next_match(search); + search->matches_mask = neon_match_mask(needs_escape); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_mask_next_match(search); } // There are fewer than 16 bytes left. @@ -439,11 +468,17 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search static inline unsigned char search_escape_basic_neon(search_state *search) { - if (RB_UNLIKELY(search->returned_from != NULL)) { - search->current_match_index += (search->ptr - search->returned_from); - search->returned_from = NULL; - if (RB_UNLIKELY(search_escape_basic_simd_next_match(search))) { - return 1; + if (RB_UNLIKELY(search->has_matches)) { + // There are more matches if search->matches_mask > 0. + if (search->matches_mask > 0) { + if (RB_LIKELY(neon_mask_next_match(search))) { + return 1; + } + } else { + // neon_mask_next_match will only advance search->ptr up to the last matching character. + // Skip over any characters in the last chunk that occur after the last match. + search->has_matches = 0; + search->ptr = search->chunk_base+sizeof(uint8x16_t); } } @@ -1331,7 +1366,8 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat #ifdef ENABLE_SIMD search.current_match_index = 0; - search.returned_from = NULL; + search.matches_mask = 0; + search.has_matches = 0; #endif /* ENABLE_SIMD */ switch(rb_enc_str_coderange(obj)) { diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index 73100723d..15fcb1ede 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -8,6 +8,32 @@ typedef enum { #ifdef ENABLE_SIMD +#ifdef __clang__ + #if __has_builtin(__builtin_ctzll) + #define HAVE_BUILTIN_CTZLL 1 + #else + #define HAVE_BUILTIN_CTZLL 0 + #endif +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define HAVE_BUILTIN_CTZLL 1 +#else + #define HAVE_BUILTIN_CTZLL 0 +#endif + +static inline uint32_t trailing_zeros(uint64_t input) { +#if HAVE_BUILTIN_CTZLL + return __builtin_ctzll(input); +#else + uint32_t trailing_zeros = 0; + uint64_t temp = input; + while ((temp & 1) == 0 && temp > 0) { + trailing_zeros++; + temp >>= 1; + } + return trailing_zeros; +#endif +} + #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) #include From 045115a3ffc161b835e0850ad1b9b7d75a3661c1 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 6 Apr 2025 20:39:31 -0500 Subject: [PATCH 14/51] Make the Neon implementation configurable based on a build parameter. --- ext/json/ext/generator/extconf.rb | 4 ++++ ext/json/ext/generator/generator.c | 25 ++++++++++++++++--------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 65f87434b..71ba695d9 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -19,6 +19,10 @@ } SRC $defs.push("-DENABLE_SIMD") + + if enable_config('generator-use-neon-lut', default=false) + $defs.push('-DUSE_NEON_LUT') + end end end diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 972595353..a8b00f2a1 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -249,6 +249,7 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) #ifdef ENABLE_SIMD #ifdef HAVE_SIMD_NEON +#ifdef USE_NEON_LUT struct _simd_state { struct { @@ -257,6 +258,7 @@ struct _simd_state { }; static struct _simd_state simd_state; +#endif /* USE_NEON_LUT */ #endif /* HAVE_SIMD_NEON */ #endif /* ENABLE_SIMD */ @@ -307,6 +309,7 @@ static inline uint64_t neon_match_mask(uint8x16_t matches) { return mask & 0x8888888888888888ull; } +#ifdef USE_NEON_LUT static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); @@ -362,6 +365,8 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s return 0; } +#else + static inline uint8x16_t neon_rules_update(uint8x16_t chunk) { const uint8x16_t lower_bound = vdupq_n_u8(' '); const uint8x16_t backslash = vdupq_n_u8('\\'); @@ -465,6 +470,7 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search return 0; } +#endif /* USE_NEON_LUT */ static inline unsigned char search_escape_basic_neon(search_state *search) { @@ -481,18 +487,15 @@ static inline unsigned char search_escape_basic_neon(search_state *search) search->ptr = search->chunk_base+sizeof(uint8x16_t); } } - - // TODO Pick an implementation or make them configurable. Right now it looks like the "rules" based approach - // might be a bit faster. - - // if (search_escape_basic_neon_advance_lut(search)) { - // return 1; - // } - +#ifdef USE_NEON_LUT + if (search_escape_basic_neon_advance_lut(search)) { + return 1; + } +#else if (search_escape_basic_neon_advance_rules(search)) { return 1; } - +#endif /* USE_NEON_LUT */ if (search->ptr < search->end) { return search_escape_basic(search); } @@ -1535,10 +1538,12 @@ static VALUE generate_json_rescue(VALUE d, VALUE exc) #ifdef ENABLE_SIMD #ifdef HAVE_SIMD_NEON +#ifdef USE_NEON_LUT static void initialize_simd_neon(void) { simd_state.neon.escape_table_basic[0] = load_uint8x16_4(escape_table_basic); simd_state.neon.escape_table_basic[1] = load_uint8x16_4(escape_table_basic+64); } +#endif /* USE_NEON_LUT */ #endif /* HAVE_NEON_SIMD */ #endif @@ -2206,7 +2211,9 @@ void Init_generator(void) #ifdef HAVE_SIMD_NEON case SIMD_NEON: /* Initialize ARM Neon SIMD Implementation. */ +#ifdef USE_NEON_LUT initialize_simd_neon(); +#endif /* USE_NEON_LUT */ search_escape_basic_impl = search_escape_basic_neon; break; #endif /* HAVE_SIMD_NEON */ From 13b2c4ff6a619c303b1d96cb5f7b4302e9ed1a0b Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 6 Apr 2025 20:49:36 -0500 Subject: [PATCH 15/51] fix: ensure code builds correctly on x86 after changing the neon implementation. --- ext/json/ext/generator/generator.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index a8b00f2a1..c772cbab6 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -1369,8 +1369,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat #ifdef ENABLE_SIMD search.current_match_index = 0; + search.returned_from = NULL; +#ifdef HAVE_NEON_SIMD search.matches_mask = 0; search.has_matches = 0; +#endif /* HAVE_NEON_SIMD */ #endif /* ENABLE_SIMD */ switch(rb_enc_str_coderange(obj)) { From d4f5bf7e5ad48cebbe4822128af5cb1ef144b7e9 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 6 Apr 2025 21:19:24 -0500 Subject: [PATCH 16/51] Use a maches mask to determine the location of the maching characters in the SSE2 implementation. --- ext/json/ext/generator/generator.c | 93 +++++++++++++++--------------- ext/json/ext/generator/simd.h | 16 ++++- 2 files changed, 63 insertions(+), 46 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index c772cbab6..7b9d7923b 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -112,17 +112,16 @@ typedef struct _search_state { FBuffer *buffer; #ifdef ENABLE_SIMD - const char *returned_from; - unsigned char maybe_matches[16]; + const char *chunk_base; + uint8_t has_matches; #ifdef HAVE_SIMD_NEON uint64_t matches_mask; - const char *chunk_base; - uint8_t has_matches; +#elif HAVE_SIMD_SSE2 + uint16_t matches_mask; +#else +#error "Unknown SIMD Implementation." #endif /* HAVE_SIMD_NEON */ - - unsigned long current_match_index; - unsigned long maybe_match_length; #endif /* ENABLE_SIMD */ } search_state; @@ -263,29 +262,12 @@ static struct _simd_state simd_state; #endif /* ENABLE_SIMD */ #ifdef ENABLE_SIMD - -static inline unsigned char search_escape_basic_simd_next_match(search_state *search) { - for(; search->current_match_index < search->maybe_match_length && search->ptr < search->end; ) { - unsigned char ch_len = search->maybe_matches[search->current_match_index]; - - if (RB_UNLIKELY(ch_len)) { - search->returned_from = search->ptr; - search_flush(search); - return 1; - } else { - search->ptr++; - search->current_match_index++; - } - } - return 0; -} - #ifdef HAVE_SIMD_NEON -static inline unsigned char neon_mask_next_match(search_state *search) { +static inline unsigned char neon_next_match(search_state *search) { uint64_t mask = search->matches_mask; if (mask > 0) { - uint32_t index = trailing_zeros(mask) >> 2; + uint32_t index = trailing_zeros64(mask) >> 2; // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. // If we want to use a similar approach for full escaping we'll need to ensure: @@ -330,7 +312,7 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s search->matches_mask = neon_match_mask(vceqq_u8(result, vdupq_n_u8(9))); search->has_matches = 1; search->chunk_base = search->ptr; - return neon_mask_next_match(search); + return neon_next_match(search); } // There are fewer than 16 bytes left. @@ -436,7 +418,7 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; search->chunk_base = search->ptr; - return neon_mask_next_match(search); + return neon_next_match(search); } // There are fewer than 16 bytes left. @@ -477,11 +459,11 @@ static inline unsigned char search_escape_basic_neon(search_state *search) if (RB_UNLIKELY(search->has_matches)) { // There are more matches if search->matches_mask > 0. if (search->matches_mask > 0) { - if (RB_LIKELY(neon_mask_next_match(search))) { + if (RB_LIKELY(neon_next_match(search))) { return 1; } } else { - // neon_mask_next_match will only advance search->ptr up to the last matching character. + // neon_next_match will only advance search->ptr up to the last matching character. // Skip over any characters in the last chunk that occur after the last match. search->has_matches = 0; search->ptr = search->chunk_base+sizeof(uint8x16_t); @@ -512,6 +494,26 @@ static inline unsigned char search_escape_basic_neon(search_state *search) // #define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) // #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) +static inline unsigned char sse2_next_match(search_state *search) { + int mask = search->matches_mask; + if (mask > 0) { + int index = trailing_zeros(mask); + + // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. + // If we want to use a similar approach for full escaping we'll need to ensure: + // search->chunk_base + index >= search->ptr + // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match + // is one byte after the previous match then: + // search->chunk_base + index == search->ptr + search->ptr = search->chunk_base + index; + mask &= mask - 1; + search->matches_mask = mask; + search_flush(search); + return 1; + } + return 0; +} + #ifdef __GNUC__ #pragma GCC push_options #pragma GCC target ("sse2") @@ -545,11 +547,17 @@ static inline __m128i sse2_update(__m128i chunk) { __attribute__((target("sse2"))) #endif /* __clang__ */ static unsigned char search_escape_basic_sse2(search_state *search) { - if (RB_UNLIKELY(search->returned_from != NULL)) { - search->current_match_index += (search->ptr - search->returned_from); - search->returned_from = NULL; - if (RB_UNLIKELY(search_escape_basic_simd_next_match(search))) { - return 1; + if (RB_UNLIKELY(search->has_matches)) { + // There are more matches if search->matches_mask > 0. + if (search->matches_mask > 0) { + if (RB_LIKELY(sse2_next_match(search))) { + return 1; + } + } else { + // sse2_next_match will only advance search->ptr up to the last matching character. + // Skip over any characters in the last chunk that occur after the last match. + search->has_matches = 0; + search->ptr = search->chunk_base+sizeof(__m128i); } } @@ -564,12 +572,10 @@ static unsigned char search_escape_basic_sse2(search_state *search) { continue; } - // It doesn't matter the value of each byte in 'maybe_matches' as long as a match is non-zero. - _mm_storeu_si128((__m128i *)search->maybe_matches, needs_escape); - - search->current_match_index = 0; - search->maybe_match_length = sizeof(__m128i); - return search_escape_basic_simd_next_match(search); + search->has_matches = 1; + search->matches_mask = needs_escape_mask; + search->chunk_base = search->ptr; + return sse2_next_match(search); } // There are fewer than 16 bytes left. @@ -1368,12 +1374,9 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat search.end = search.ptr + len; #ifdef ENABLE_SIMD - search.current_match_index = 0; - search.returned_from = NULL; -#ifdef HAVE_NEON_SIMD search.matches_mask = 0; search.has_matches = 0; -#endif /* HAVE_NEON_SIMD */ + search.chunk_base = NULL; #endif /* ENABLE_SIMD */ switch(rb_enc_str_coderange(obj)) { diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index 15fcb1ede..f58848176 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -20,7 +20,7 @@ typedef enum { #define HAVE_BUILTIN_CTZLL 0 #endif -static inline uint32_t trailing_zeros(uint64_t input) { +static inline uint32_t trailing_zeros64(uint64_t input) { #if HAVE_BUILTIN_CTZLL return __builtin_ctzll(input); #else @@ -34,6 +34,20 @@ static inline uint32_t trailing_zeros(uint64_t input) { #endif } +static inline int trailing_zeros(int input) { + #if HAVE_BUILTIN_CTZLL + return __builtin_ctz(input); + #else + int trailing_zeros = 0; + int temp = input; + while ((temp & 1) == 0 && temp > 0) { + trailing_zeros++; + temp >>= 1; + } + return trailing_zeros; + #endif + } + #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) #include From be7456c7d3ca12a33571a19df7110ffbf31dc9ac Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Mon, 7 Apr 2025 09:05:53 -0500 Subject: [PATCH 17/51] Fix a build issue on ruby 2.7 for SSE2 support. --- ext/json/ext/generator/extconf.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 71ba695d9..ad5739577 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -26,7 +26,7 @@ end end - if have_type('__m128i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-msse2') + if have_header('x86intrin.h') && have_type('__m128i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-msse2') #include int main() { __m128i test = _mm_set1_epi8(32); From 49702550af16ad475b78da476af07b5767c14cb7 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Mon, 7 Apr 2025 09:08:28 -0500 Subject: [PATCH 18/51] PR Feedback. --- ext/json/ext/generator/generator.c | 2 +- ext/json/ext/generator/simd.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 7b9d7923b..4848d9983 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -118,7 +118,7 @@ typedef struct _search_state { #ifdef HAVE_SIMD_NEON uint64_t matches_mask; #elif HAVE_SIMD_SSE2 - uint16_t matches_mask; + int matches_mask; #else #error "Unknown SIMD Implementation." #endif /* HAVE_SIMD_NEON */ diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index f58848176..751b9d81c 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -52,7 +52,7 @@ static inline int trailing_zeros(int input) { #include #define FIND_SIMD_IMPLEMENTATION_DEFINED 1 -SIMD_Implementation find_simd_implementation() { +static SIMD_Implementation find_simd_implementation() { return SIMD_NEON; } @@ -103,7 +103,7 @@ void print_m128i(const char *prefix, __m128i vec) { #include #endif /* HAVE_CPUID_H */ -SIMD_Implementation find_simd_implementation(void) { +static SIMD_Implementation find_simd_implementation(void) { #if defined(__GNUC__ ) || defined(__clang__) #ifdef __GNUC__ @@ -125,7 +125,7 @@ SIMD_Implementation find_simd_implementation(void) { #endif /* ENABLE_SIMD */ #ifndef FIND_SIMD_IMPLEMENTATION_DEFINED -SIMD_Implementation find_simd_implementation(void) { +static SIMD_Implementation find_simd_implementation(void) { return SIMD_NONE; } #endif \ No newline at end of file From 1c6ee3dadf627bb7941e5d52f559a698f377741d Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Mon, 7 Apr 2025 09:27:20 -0500 Subject: [PATCH 19/51] A few tweaks to the SSE algorithm. --- ext/json/ext/generator/generator.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 4848d9983..99a8c5aad 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -489,10 +489,10 @@ static inline unsigned char search_escape_basic_neon(search_state *search) #ifdef HAVE_SIMD_SSE2 -// #define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) -// #define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) -// #define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) -// #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) +#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) +#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) +#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) +#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) static inline unsigned char sse2_next_match(search_state *search) { int mask = search->matches_mask; @@ -526,16 +526,16 @@ static inline __m128i sse2_update(__m128i chunk) { const __m128i lower_bound = _mm_set1_epi8(' '); const __m128i backslash = _mm_set1_epi8('\\'); const __m128i dblquote = _mm_set1_epi8('\"'); - const __m128i high_bit = _mm_set1_epi8(0x80); + // const __m128i high_bit = _mm_set1_epi8(0x80); - // __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); - // This is a signed comparison. We need special handling for bytes > 127. - __m128i too_low = _mm_cmplt_epi8(chunk, lower_bound); + // // This is a signed comparison. We need special handling for bytes > 127. + // __m128i too_low = _mm_cmplt_epi8(chunk, lower_bound); - // Determine which bytes have the high bit set and remove them from 'too_low'. - __m128i high_bit_set = _mm_cmpeq_epi8(_mm_and_si128(chunk, high_bit), high_bit); - too_low = _mm_andnot_si128(high_bit_set, too_low); + // // Determine which bytes have the high bit set and remove them from 'too_low'. + // __m128i high_bit_set = _mm_cmpeq_epi8(_mm_and_si128(chunk, high_bit), high_bit); + // too_low = _mm_andnot_si128(high_bit_set, too_low); __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); From b7b120bbf6787279639d91c9b27bb0cb5416c1d8 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Tue, 8 Apr 2025 07:50:38 -0500 Subject: [PATCH 20/51] Changed the '<' comparison to '<=' in the SIMD loop iterating through the string. --- ext/json/ext/generator/generator.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 99a8c5aad..52276f335 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -300,7 +300,7 @@ static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { } static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { - while (search->ptr+sizeof(uint8x16_t) < search->end) { + while (search->ptr+sizeof(uint8x16_t) <= search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); uint8x16_t result = neon_lut_update(chunk); @@ -406,8 +406,9 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search * To determine how to escape characters, we look at each value in the needs_escape vector and take * the appropriate action. */ - while (search->ptr+sizeof(uint8x16_t) < search->end) { + while (search->ptr+sizeof(uint8x16_t) <= search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + uint8x16_t needs_escape = neon_rules_update(chunk); if (vmaxvq_u8(needs_escape) == 0) { @@ -561,7 +562,7 @@ static unsigned char search_escape_basic_sse2(search_state *search) { } } - while (search->ptr+sizeof(__m128i) < search->end) { + while (search->ptr+sizeof(__m128i) <= search->end) { __m128i chunk = _mm_loadu_si128((__m128i const*)search->ptr); __m128i needs_escape = sse2_update(chunk); From e5c5e7cd8818acdd841ef828e38eaa30dc66622d Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Tue, 8 Apr 2025 21:03:36 -0500 Subject: [PATCH 21/51] Make the search_escape_basic_impl function pointer static. --- ext/json/ext/generator/generator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 52276f335..b8377e329 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -146,7 +146,7 @@ static const unsigned char escape_table_basic[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; -unsigned char (*search_escape_basic_impl)(search_state *); +static unsigned char (*search_escape_basic_impl)(search_state *); static inline unsigned char search_escape_basic(search_state *search) { From 062587e8211e3305ffb22812e02e7d1d4f3966ff Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 9 Apr 2025 08:41:46 -0500 Subject: [PATCH 22/51] Ensure all search_escape_basic* functions are inlined. --- ext/json/ext/generator/generator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index b8377e329..b548563bf 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -547,7 +547,7 @@ static inline __m128i sse2_update(__m128i chunk) { #ifdef __clang__ __attribute__((target("sse2"))) #endif /* __clang__ */ -static unsigned char search_escape_basic_sse2(search_state *search) { +static inline unsigned char search_escape_basic_sse2(search_state *search) { if (RB_UNLIKELY(search->has_matches)) { // There are more matches if search->matches_mask > 0. if (search->matches_mask > 0) { From f49af9badf0c5afd44c7d33a2cfe99793d7f8495 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 9 Apr 2025 08:59:25 -0500 Subject: [PATCH 23/51] Refactor the code that copies the last remaining characters in the SIMD-fallback case to a method. --- ext/json/ext/generator/generator.c | 67 +++++++++++------------------- 1 file changed, 25 insertions(+), 42 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index b548563bf..50097f0a8 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -262,6 +262,25 @@ static struct _simd_state simd_state; #endif /* ENABLE_SIMD */ #ifdef ENABLE_SIMD + +static inline char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len) { + // Flush the buffer so everything up until the last 'len' characters are unflushed. + search_flush(search); + + FBuffer *buf = search->buffer; + fbuffer_inc_capa(buf, vec_len); + + char *s = (buf->ptr + buf->len); + + memset(s, 'X', len); + + // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters + // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. + memcpy(s, search->ptr, len); + + return s; +} + #ifdef HAVE_SIMD_NEON static inline unsigned char neon_next_match(search_state *search) { @@ -318,26 +337,14 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s // There are fewer than 16 bytes left. unsigned long remaining = (search->end - search->ptr); if (remaining >= 8) { - // Flush the buffer so everything up until the last 'remaining' characters are unflushed. - search_flush(search); - - FBuffer *buf = search->buffer; - fbuffer_inc_capa(buf, sizeof(uint8x16_t)); - - char *s = (buf->ptr + buf->len); - - memset(s, 'X', sizeof(uint8x16_t)); - - // Optimistically copy the remaining characters to the output FBuffer. If there are no characters - // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. - memcpy(s, search->ptr, remaining); + char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); uint8x16_t chunk = vld1q_u8((const unsigned char *) s); uint8x16_t result = neon_lut_update(chunk); if (vmaxvq_u8(result) == 0) { // Nothing to escape, ensure search_flush doesn't do anything by setting // search->cursor to search->ptr. - buf->len += remaining; + search->buffer->len += remaining; search->ptr = search->end; search->cursor = search->end; return 0; @@ -425,26 +432,14 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search // There are fewer than 16 bytes left. unsigned long remaining = (search->end - search->ptr); if (remaining >= 8) { - // Flush the buffer so everything up until the last 'remaining' characters are unflushed. - search_flush(search); - - FBuffer *buf = search->buffer; - fbuffer_inc_capa(buf, sizeof(uint8x16_t)); - - char *s = (buf->ptr + buf->len); - - memset(s, 'X', sizeof(uint8x16_t)); - - // Optimistically copy the remaining characters to the output FBuffer. If there are no characters - // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. - memcpy(s, search->ptr, remaining); + char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); uint8x16_t chunk = vld1q_u8((const unsigned char *) s); uint8x16_t result = neon_rules_update(chunk); if (vmaxvq_u8(result) == 0) { // Nothing to escape, ensure search_flush doesn't do anything by setting // search->cursor to search->ptr. - buf->len += remaining; + search->buffer->len += remaining; search->ptr = search->end; search->cursor = search->end; return 0; @@ -582,19 +577,7 @@ static inline unsigned char search_escape_basic_sse2(search_state *search) { // There are fewer than 16 bytes left. unsigned long remaining = (search->end - search->ptr); if (remaining >= 8) { - // Flush the buffer so everything up until the last 'remaining' characters are unflushed. - search_flush(search); - - FBuffer *buf = search->buffer; - fbuffer_inc_capa(buf, sizeof(__m128i)); - - char *s = (buf->ptr + buf->len); - - memset(s, 'X', sizeof(__m128i)); - - // Optimistically copy the remaining characters to the output FBuffer. If there are no characters - // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. - memcpy(s, search->ptr, remaining); + char *s = copy_remaining_bytes(search, sizeof(__m128i), remaining); __m128i chunk = _mm_loadu_si128((__m128i const *) s); __m128i needs_escape = sse2_update(chunk); @@ -604,7 +587,7 @@ static inline unsigned char search_escape_basic_sse2(search_state *search) { if (needs_escape_mask == 0) { // Nothing to escape, ensure search_flush doesn't do anything by setting // search->cursor to search->ptr. - buf->len += remaining; + search->buffer->len += remaining; search->ptr = search->end; search->cursor = search->end; return 0; From 15f1887390c278b62d035626f31bd91b73680b11 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 9 Apr 2025 09:30:53 -0500 Subject: [PATCH 24/51] Change 'len' to 'vec_len' to ensure bytes past 'len' do not need to be escaped. --- ext/json/ext/generator/generator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 50097f0a8..b57a65d79 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -272,7 +272,7 @@ static inline char *copy_remaining_bytes(search_state *search, unsigned long vec char *s = (buf->ptr + buf->len); - memset(s, 'X', len); + memset(s, 'X', vec_len); // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. From a666f5a2fba360b5a27fa19c9043143e1d34dbcb Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 9 Apr 2025 21:51:55 -0500 Subject: [PATCH 25/51] Added the ability to use the matches_mask in the case there isn't a full vector's width worth of data remaining. --- ext/json/ext/generator/generator.c | 36 +++++++++++++++++++++--------- ext/json/ext/generator/simd.h | 4 +++- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index b57a65d79..a920a0067 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -321,14 +321,14 @@ static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { while (search->ptr+sizeof(uint8x16_t) <= search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); - uint8x16_t result = neon_lut_update(chunk); + uint8x16_t needs_escape = neon_lut_update(chunk); - if (vmaxvq_u8(result) == 0) { + if (vmaxvq_u8(needs_escape) == 0) { search->ptr += sizeof(uint8x16_t); continue; } - search->matches_mask = neon_match_mask(vceqq_u8(result, vdupq_n_u8(9))); + search->matches_mask = neon_match_mask(vceqq_u8(needs_escape, vdupq_n_u8(9))); search->has_matches = 1; search->chunk_base = search->ptr; return neon_next_match(search); @@ -336,18 +336,23 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s // There are fewer than 16 bytes left. unsigned long remaining = (search->end - search->ptr); - if (remaining >= 8) { + if (remaining >= SIMD_MINIMUM_THRESHOLD) { char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); uint8x16_t chunk = vld1q_u8((const unsigned char *) s); - uint8x16_t result = neon_lut_update(chunk); - if (vmaxvq_u8(result) == 0) { + uint8x16_t needs_escape = neon_lut_update(chunk); + if (vmaxvq_u8(needs_escape) == 0) { // Nothing to escape, ensure search_flush doesn't do anything by setting // search->cursor to search->ptr. search->buffer->len += remaining; search->ptr = search->end; search->cursor = search->end; return 0; + } else { + search->matches_mask = neon_match_mask(vceqq_u8(needs_escape, vdupq_n_u8(9))); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_next_match(search); } } @@ -431,18 +436,23 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search // There are fewer than 16 bytes left. unsigned long remaining = (search->end - search->ptr); - if (remaining >= 8) { + if (remaining >= SIMD_MINIMUM_THRESHOLD) { char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); uint8x16_t chunk = vld1q_u8((const unsigned char *) s); - uint8x16_t result = neon_rules_update(chunk); - if (vmaxvq_u8(result) == 0) { + uint8x16_t needs_escape = neon_rules_update(chunk); + if (vmaxvq_u8(needs_escape) == 0) { // Nothing to escape, ensure search_flush doesn't do anything by setting // search->cursor to search->ptr. search->buffer->len += remaining; search->ptr = search->end; search->cursor = search->end; return 0; + } else { + search->matches_mask = neon_match_mask(needs_escape); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_next_match(search); } } @@ -462,7 +472,11 @@ static inline unsigned char search_escape_basic_neon(search_state *search) // neon_next_match will only advance search->ptr up to the last matching character. // Skip over any characters in the last chunk that occur after the last match. search->has_matches = 0; - search->ptr = search->chunk_base+sizeof(uint8x16_t); + if (RB_UNLIKELY(search->chunk_base+sizeof(uint8x16_t) >= search->end)) { + search->ptr = search->end; + } else { + search->ptr = search->chunk_base+sizeof(uint8x16_t); + } } } #ifdef USE_NEON_LUT @@ -576,7 +590,7 @@ static inline unsigned char search_escape_basic_sse2(search_state *search) { // There are fewer than 16 bytes left. unsigned long remaining = (search->end - search->ptr); - if (remaining >= 8) { + if (remaining >= SIMD_MINIMUM_THRESHOLD) { char *s = copy_remaining_bytes(search, sizeof(__m128i), remaining); __m128i chunk = _mm_loadu_si128((__m128i const *) s); diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index 751b9d81c..85716ab69 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -46,7 +46,9 @@ static inline int trailing_zeros(int input) { } return trailing_zeros; #endif - } +} + +#define SIMD_MINIMUM_THRESHOLD 6 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) #include From 1dc47f888464aed44532020aad8fd5bd3822b77c Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 9 Apr 2025 22:17:54 -0500 Subject: [PATCH 26/51] SSE implementation of using the escape mask when there isn't a full vector's width worth of data. --- ext/json/ext/generator/generator.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index a920a0067..65cff7f4c 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -567,7 +567,11 @@ static inline unsigned char search_escape_basic_sse2(search_state *search) { // sse2_next_match will only advance search->ptr up to the last matching character. // Skip over any characters in the last chunk that occur after the last match. search->has_matches = 0; - search->ptr = search->chunk_base+sizeof(__m128i); + if (RB_UNLIKELY(search->chunk_base+sizeof(__m128i) >= search->end)) { + search->ptr = search->end; + } else { + search->ptr = search->chunk_base+sizeof(__m128i); + } } } @@ -605,6 +609,11 @@ static inline unsigned char search_escape_basic_sse2(search_state *search) { search->ptr = search->end; search->cursor = search->end; return 0; + } else { + search->has_matches = 1; + search->matches_mask = needs_escape_mask; + search->chunk_base = search->ptr; + return sse2_next_match(search); } } From af822fce16cb7b76f807ed84bb834fa9d311e963 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 16 Apr 2025 21:36:31 -0500 Subject: [PATCH 27/51] Optimizations, comments and formatting. Still work in progress. --- ext/json/ext/generator/generator.c | 104 ++++++++++++++++++----------- 1 file changed, 66 insertions(+), 38 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 65cff7f4c..7f4cf13c2 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -125,9 +125,21 @@ typedef struct _search_state { #endif /* ENABLE_SIMD */ } search_state; -static inline void search_flush(search_state *search) +#if (defined(__GNUC__ ) || defined(__clang__)) +#define FORCE_INLINE __attribute__((always_inline)) +#else +#define FORCE_INLINE +#endif + +static inline FORCE_INLINE void search_flush(search_state *search) { - if (search->cursor < search->ptr) { + // Do not remove this conditional without profiling, specfically escape-heavy text. + // escape_UTF8_char_basic will advance search->ptr and search->cursor (effectively a search_flush). + // For back-to-back characters that need to be escaped, specifcally for the SIMD code paths, this method + // will be called just before calling escape_UTF8_char_basic. There will be no characers to append for the + // consecutive characters that need to be escaped. While the fbuffer_append is a no-op if + // nothing needs to be flushed, we can save a few memory references with this conditional. + if (search->ptr > search->cursor) { fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); search->cursor = search->ptr; } @@ -162,7 +174,8 @@ static inline unsigned char search_escape_basic(search_state *search) return 0; } -static inline void escape_UTF8_char_basic(search_state *search) { +static inline void escape_UTF8_char_basic(search_state *search) +{ const unsigned char ch = (unsigned char)*search->ptr; switch (ch) { case '"': fbuffer_append(search->buffer, "\\\"", 2); break; @@ -209,7 +222,8 @@ static inline void convert_UTF8_to_JSON(search_state *search) } } -static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) { +static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) +{ const unsigned char ch = (unsigned char)*search->ptr; switch (ch_len) { case 1: { @@ -263,7 +277,8 @@ static struct _simd_state simd_state; #ifdef ENABLE_SIMD -static inline char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len) { +static inline FORCE_INLINE char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len) +{ // Flush the buffer so everything up until the last 'len' characters are unflushed. search_flush(search); @@ -276,49 +291,50 @@ static inline char *copy_remaining_bytes(search_state *search, unsigned long vec // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. - memcpy(s, search->ptr, len); + MEMCPY(s, search->ptr, char, len); return s; } #ifdef HAVE_SIMD_NEON -static inline unsigned char neon_next_match(search_state *search) { +static inline FORCE_INLINE unsigned char neon_next_match(search_state *search) +{ uint64_t mask = search->matches_mask; - if (mask > 0) { - uint32_t index = trailing_zeros64(mask) >> 2; - - // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. - // If we want to use a similar approach for full escaping we'll need to ensure: - // search->chunk_base + index >= search->ptr - // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match - // is one byte after the previous match then: - // search->chunk_base + index == search->ptr - search->ptr = search->chunk_base + index; - mask &= mask - 1; - search->matches_mask = mask; - search_flush(search); - return 1; - } - return 0; + uint32_t index = trailing_zeros64(mask) >> 2; + + // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. + // If we want to use a similar approach for full escaping we'll need to ensure: + // search->chunk_base + index >= search->ptr + // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match + // is one byte after the previous match then: + // search->chunk_base + index == search->ptr + search->ptr = search->chunk_base + index; + mask &= mask - 1; + search->matches_mask = mask; + search_flush(search); + return 1; } // See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon -static inline uint64_t neon_match_mask(uint8x16_t matches) { +static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches) +{ const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4); const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0); return mask & 0x8888888888888888ull; } #ifdef USE_NEON_LUT -static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { +static inline FORCE_INLINE uint8x16_t neon_lut_update(uint8x16_t chunk) +{ uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); uint8x16_t result = vorrq_u8(tmp1, tmp2); return result; } -static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { +static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_lut(search_state *search) +{ while (search->ptr+sizeof(uint8x16_t) <= search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); uint8x16_t needs_escape = neon_lut_update(chunk); @@ -328,7 +344,7 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s continue; } - search->matches_mask = neon_match_mask(vceqq_u8(needs_escape, vdupq_n_u8(9))); + search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; search->chunk_base = search->ptr; return neon_next_match(search); @@ -349,7 +365,7 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s search->cursor = search->end; return 0; } else { - search->matches_mask = neon_match_mask(vceqq_u8(needs_escape, vdupq_n_u8(9))); + search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; search->chunk_base = search->ptr; return neon_next_match(search); @@ -361,7 +377,8 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s #else -static inline uint8x16_t neon_rules_update(uint8x16_t chunk) { +static inline FORCE_INLINE uint8x16_t neon_rules_update(uint8x16_t chunk) +{ const uint8x16_t lower_bound = vdupq_n_u8(' '); const uint8x16_t backslash = vdupq_n_u8('\\'); const uint8x16_t dblquote = vdupq_n_u8('\"'); @@ -374,7 +391,8 @@ static inline uint8x16_t neon_rules_update(uint8x16_t chunk) { return needs_escape; } -static unsigned char search_escape_basic_neon_advance_rules(search_state *search) { +static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_rules(search_state *search) +{ /* * The code below implements an SIMD-based algorithm to determine if N bytes at a time * need to be escaped. @@ -420,7 +438,6 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search */ while (search->ptr+sizeof(uint8x16_t) <= search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); - uint8x16_t needs_escape = neon_rules_update(chunk); if (vmaxvq_u8(needs_escape) == 0) { @@ -465,9 +482,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) if (RB_UNLIKELY(search->has_matches)) { // There are more matches if search->matches_mask > 0. if (search->matches_mask > 0) { - if (RB_LIKELY(neon_next_match(search))) { - return 1; - } + return neon_next_match(search); } else { // neon_next_match will only advance search->ptr up to the last matching character. // Skip over any characters in the last chunk that occur after the last match. @@ -479,6 +494,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) } } } + #ifdef USE_NEON_LUT if (search_escape_basic_neon_advance_lut(search)) { return 1; @@ -504,7 +520,8 @@ static inline unsigned char search_escape_basic_neon(search_state *search) #define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) -static inline unsigned char sse2_next_match(search_state *search) { +static inline unsigned char sse2_next_match(search_state *search) +{ int mask = search->matches_mask; if (mask > 0) { int index = trailing_zeros(mask); @@ -532,7 +549,8 @@ static inline unsigned char sse2_next_match(search_state *search) { #ifdef __clang__ __attribute__((target("sse2"))) #endif /* __clang__ */ -static inline __m128i sse2_update(__m128i chunk) { +static inline __m128i sse2_update(__m128i chunk) +{ const __m128i lower_bound = _mm_set1_epi8(' '); const __m128i backslash = _mm_set1_epi8('\\'); const __m128i dblquote = _mm_set1_epi8('\"'); @@ -556,7 +574,8 @@ static inline __m128i sse2_update(__m128i chunk) { #ifdef __clang__ __attribute__((target("sse2"))) #endif /* __clang__ */ -static inline unsigned char search_escape_basic_sse2(search_state *search) { +static inline unsigned char search_escape_basic_sse2(search_state *search) +{ if (RB_UNLIKELY(search->has_matches)) { // There are more matches if search->matches_mask > 0. if (search->matches_mask > 0) { @@ -1549,12 +1568,21 @@ static VALUE generate_json_rescue(VALUE d, VALUE exc) /* SIMD Utilities (if enabled) */ #ifdef ENABLE_SIMD - #ifdef HAVE_SIMD_NEON #ifdef USE_NEON_LUT static void initialize_simd_neon(void) { simd_state.neon.escape_table_basic[0] = load_uint8x16_4(escape_table_basic); simd_state.neon.escape_table_basic[1] = load_uint8x16_4(escape_table_basic+64); + + simd_state.neon.escape_table_basic[0].val[0] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[0], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[0].val[1] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[1], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[0].val[2] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[2], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[0].val[3] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[3], vdupq_n_u8(9)); + + simd_state.neon.escape_table_basic[1].val[0] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[0], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[1].val[1] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[1], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[1].val[2] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[2], vdupq_n_u8(9)); + simd_state.neon.escape_table_basic[1].val[3] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[3], vdupq_n_u8(9)); } #endif /* USE_NEON_LUT */ #endif /* HAVE_NEON_SIMD */ From ad995fcf8044777e577a384e2f9a2e90a4eaa42d Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Thu, 17 Apr 2025 21:00:20 -0500 Subject: [PATCH 28/51] Implemented optimizations in the SSE2 implemenation. A few simplifications too. --- ext/json/ext/generator/generator.c | 67 ++++++++++-------------------- 1 file changed, 22 insertions(+), 45 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 7f4cf13c2..ed5221f2b 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -520,68 +520,49 @@ static inline unsigned char search_escape_basic_neon(search_state *search) #define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) -static inline unsigned char sse2_next_match(search_state *search) +static inline FORCE_INLINE unsigned char sse2_next_match(search_state *search) { int mask = search->matches_mask; - if (mask > 0) { - int index = trailing_zeros(mask); - - // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. - // If we want to use a similar approach for full escaping we'll need to ensure: - // search->chunk_base + index >= search->ptr - // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match - // is one byte after the previous match then: - // search->chunk_base + index == search->ptr - search->ptr = search->chunk_base + index; - mask &= mask - 1; - search->matches_mask = mask; - search_flush(search); - return 1; - } - return 0; + int index = trailing_zeros(mask); + + // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. + // If we want to use a similar approach for full escaping we'll need to ensure: + // search->chunk_base + index >= search->ptr + // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match + // is one byte after the previous match then: + // search->chunk_base + index == search->ptr + search->ptr = search->chunk_base + index; + mask &= mask - 1; + search->matches_mask = mask; + search_flush(search); + return 1; } -#ifdef __GNUC__ -#pragma GCC push_options -#pragma GCC target ("sse2") -#endif /* __GNUC__ */ +#if defined(__clang__) || defined(__GNUC__) +#define TARGET_SSE2 __attribute__((target("sse2"))) +#else +#define TARGET_SSE2 +#endif -#ifdef __clang__ -__attribute__((target("sse2"))) -#endif /* __clang__ */ -static inline __m128i sse2_update(__m128i chunk) +static inline TARGET_SSE2 FORCE_INLINE __m128i sse2_update(__m128i chunk) { const __m128i lower_bound = _mm_set1_epi8(' '); const __m128i backslash = _mm_set1_epi8('\\'); const __m128i dblquote = _mm_set1_epi8('\"'); - // const __m128i high_bit = _mm_set1_epi8(0x80); __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); - - // // This is a signed comparison. We need special handling for bytes > 127. - // __m128i too_low = _mm_cmplt_epi8(chunk, lower_bound); - - // // Determine which bytes have the high bit set and remove them from 'too_low'. - // __m128i high_bit_set = _mm_cmpeq_epi8(_mm_and_si128(chunk, high_bit), high_bit); - // too_low = _mm_andnot_si128(high_bit_set, too_low); - __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); return needs_escape; } -#ifdef __clang__ -__attribute__((target("sse2"))) -#endif /* __clang__ */ -static inline unsigned char search_escape_basic_sse2(search_state *search) +static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(search_state *search) { if (RB_UNLIKELY(search->has_matches)) { // There are more matches if search->matches_mask > 0. if (search->matches_mask > 0) { - if (RB_LIKELY(sse2_next_match(search))) { - return 1; - } + return sse2_next_match(search); } else { // sse2_next_match will only advance search->ptr up to the last matching character. // Skip over any characters in the last chunk that occur after the last match. @@ -644,10 +625,6 @@ static inline unsigned char search_escape_basic_sse2(search_state *search) return 0; } -#ifdef __GNUC__ -#pragma GCC reset_options -#endif /* __GNUC__ */ - #endif /* HAVE_SIMD_SSE2 */ #endif /* ENABLE_SIMD */ From 9cf63a15c67551a79519ceb46b5be6f6a1d3331b Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 19 Apr 2025 21:34:46 -0500 Subject: [PATCH 29/51] Updates to better handle escape-heavy workloads on ARM Neon. --- ext/json/ext/generator/generator.c | 75 ++++++++++++++++++++---------- ext/json/ext/generator/simd.h | 14 ++++++ 2 files changed, 65 insertions(+), 24 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index ed5221f2b..d8ab87782 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -217,8 +217,11 @@ static inline void escape_UTF8_char_basic(search_state *search) */ static inline void convert_UTF8_to_JSON(search_state *search) { - while (search_escape_basic_impl(search)) { - escape_UTF8_char_basic(search); + unsigned char num_chars = 0; + while ((num_chars = search_escape_basic_impl(search))) { + do { + escape_UTF8_char_basic(search); + } while (--num_chars); } } @@ -336,14 +339,19 @@ static inline FORCE_INLINE uint8x16_t neon_lut_update(uint8x16_t chunk) static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_lut(search_state *search) { while (search->ptr+sizeof(uint8x16_t) <= search->end) { - uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); uint8x16_t needs_escape = neon_lut_update(chunk); - - if (vmaxvq_u8(needs_escape) == 0) { + uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); + + if (popcnt == 0) { search->ptr += sizeof(uint8x16_t); continue; } + if (popcnt >= (int) sizeof(uint8x16_t)/2) { + return sizeof(uint8x16_t); + } + search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; search->chunk_base = search->ptr; @@ -355,21 +363,27 @@ static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_lut(se if (remaining >= SIMD_MINIMUM_THRESHOLD) { char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); - uint8x16_t chunk = vld1q_u8((const unsigned char *) s); + uint8x16_t chunk = vld1q_u8((const unsigned char *) s); uint8x16_t needs_escape = neon_lut_update(chunk); - if (vmaxvq_u8(needs_escape) == 0) { + uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); + + if (popcnt == 0) { // Nothing to escape, ensure search_flush doesn't do anything by setting // search->cursor to search->ptr. search->buffer->len += remaining; search->ptr = search->end; search->cursor = search->end; return 0; - } else { - search->matches_mask = neon_match_mask(needs_escape); - search->has_matches = 1; - search->chunk_base = search->ptr; - return neon_next_match(search); } + + if (popcnt >= sizeof(uint8x16_t)/2) { + return remaining; + } + + search->matches_mask = neon_match_mask(needs_escape); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_next_match(search); } return 0; @@ -439,11 +453,16 @@ static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_rules( while (search->ptr+sizeof(uint8x16_t) <= search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); uint8x16_t needs_escape = neon_rules_update(chunk); + uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); - if (vmaxvq_u8(needs_escape) == 0) { + if (popcnt == 0) { search->ptr += sizeof(uint8x16_t); continue; } + + if (popcnt >= sizeof(uint8x16_t)/2) { + return sizeof(uint8x16_t); + } search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; @@ -456,21 +475,27 @@ static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_rules( if (remaining >= SIMD_MINIMUM_THRESHOLD) { char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); - uint8x16_t chunk = vld1q_u8((const unsigned char *) s); + uint8x16_t chunk = vld1q_u8((const unsigned char *) s); uint8x16_t needs_escape = neon_rules_update(chunk); - if (vmaxvq_u8(needs_escape) == 0) { + uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); + + if (popcnt == 0) { // Nothing to escape, ensure search_flush doesn't do anything by setting // search->cursor to search->ptr. search->buffer->len += remaining; search->ptr = search->end; search->cursor = search->end; return 0; - } else { - search->matches_mask = neon_match_mask(needs_escape); - search->has_matches = 1; - search->chunk_base = search->ptr; - return neon_next_match(search); } + + if (popcnt >= sizeof(uint8x16_t)/2) { + return remaining; + } + + search->matches_mask = neon_match_mask(needs_escape); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_next_match(search); } return 0; @@ -496,12 +521,14 @@ static inline unsigned char search_escape_basic_neon(search_state *search) } #ifdef USE_NEON_LUT - if (search_escape_basic_neon_advance_lut(search)) { - return 1; + unsigned char num_chars = 0; + if ((num_chars = search_escape_basic_neon_advance_lut(search))) { + return num_chars; } #else - if (search_escape_basic_neon_advance_rules(search)) { - return 1; + unsigned char num_chars = 0; + if ((num_chars = search_escape_basic_neon_advance_rules(search))) { + return num_chars; } #endif /* USE_NEON_LUT */ if (search->ptr < search->end) { diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index 85716ab69..ca3e40bc3 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -48,6 +48,20 @@ static inline int trailing_zeros(int input) { #endif } +uint32_t popcount32(uint32_t x) { + #if defined(__GNUC__) || defined(__clang__) + return __builtin_popcount(x); + #elif defined(__ARM_NEON) + #include + return vaddv_u8(vcnt_u8(vcreate_u8((uint64_t)x))) & 0xFF; + #else + x = x - ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x + (x >> 4)) & 0x0F0F0F0F; + return (x * 0x01010101) >> 24; + #endif +} + #define SIMD_MINIMUM_THRESHOLD 6 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) From df76269b46eb79f13c6abe05b3e0a43cbcabe757 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 19 Apr 2025 21:42:50 -0500 Subject: [PATCH 30/51] Apply the same optimizations to the SSE2 implementation. --- ext/json/ext/generator/generator.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index d8ab87782..d143cde81 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -613,6 +613,10 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se continue; } + if (popcount32(needs_escape_mask) >= sizeof(__m128i)/2) { + return sizeof(__m128i); + } + search->has_matches = 1; search->matches_mask = needs_escape_mask; search->chunk_base = search->ptr; @@ -636,12 +640,16 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se search->ptr = search->end; search->cursor = search->end; return 0; - } else { - search->has_matches = 1; - search->matches_mask = needs_escape_mask; - search->chunk_base = search->ptr; - return sse2_next_match(search); } + + if (popcount32(needs_escape_mask) >= sizeof(__m128i)/2) { + return remaining; + } + + search->has_matches = 1; + search->matches_mask = needs_escape_mask; + search->chunk_base = search->ptr; + return sse2_next_match(search); } if (search->ptr < search->end) { From 769c0ace1ec3cf3ef8a213507bfec19e49ef70fb Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 23 Apr 2025 07:16:49 -0500 Subject: [PATCH 31/51] WIP --- ext/json/ext/generator/generator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 0f1fff11e..493a28544 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -175,7 +175,7 @@ static inline unsigned char search_escape_basic(search_state *search) return 0; } -static inline void escape_UTF8_char_basic(search_state *search) +static inline FORCE_INLINE void escape_UTF8_char_basic(search_state *search) { const unsigned char ch = (unsigned char)*search->ptr; switch (ch) { From 3686c5e8690c8222ca25ec71d28f7bb6ce2cfe68 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 23 Apr 2025 20:17:02 -0500 Subject: [PATCH 32/51] Remove the lookup table ARM Neon implementation. It was every so slightly slower than the rules based implementation. --- ext/json/ext/generator/extconf.rb | 4 - ext/json/ext/generator/generator.c | 173 ++++------------------------- 2 files changed, 23 insertions(+), 154 deletions(-) diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index ad5739577..2ebac48b1 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -19,10 +19,6 @@ } SRC $defs.push("-DENABLE_SIMD") - - if enable_config('generator-use-neon-lut', default=false) - $defs.push('-DUSE_NEON_LUT') - end end end diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 493a28544..f94ebd7de 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -265,22 +265,6 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) #ifdef ENABLE_SIMD -#ifdef HAVE_SIMD_NEON -#ifdef USE_NEON_LUT -struct _simd_state { - - struct { - uint8x16x4_t escape_table_basic[2]; - } neon; -}; - -static struct _simd_state simd_state; -#endif /* USE_NEON_LUT */ -#endif /* HAVE_SIMD_NEON */ -#endif /* ENABLE_SIMD */ - -#ifdef ENABLE_SIMD - static inline FORCE_INLINE char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len) { // Flush the buffer so everything up until the last 'len' characters are unflushed. @@ -328,70 +312,6 @@ static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches) return mask & 0x8888888888888888ull; } -#ifdef USE_NEON_LUT -static inline FORCE_INLINE uint8x16_t neon_lut_update(uint8x16_t chunk) -{ - uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); - uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); - uint8x16_t result = vorrq_u8(tmp1, tmp2); - return result; -} - -static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_lut(search_state *search) -{ - while (search->ptr+sizeof(uint8x16_t) <= search->end) { - uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); - uint8x16_t needs_escape = neon_lut_update(chunk); - uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); - - if (popcnt == 0) { - search->ptr += sizeof(uint8x16_t); - continue; - } - - if (popcnt >= (int) sizeof(uint8x16_t)/2) { - return sizeof(uint8x16_t); - } - - search->matches_mask = neon_match_mask(needs_escape); - search->has_matches = 1; - search->chunk_base = search->ptr; - return neon_next_match(search); - } - - // There are fewer than 16 bytes left. - unsigned long remaining = (search->end - search->ptr); - if (remaining >= SIMD_MINIMUM_THRESHOLD) { - char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); - - uint8x16_t chunk = vld1q_u8((const unsigned char *) s); - uint8x16_t needs_escape = neon_lut_update(chunk); - uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); - - if (popcnt == 0) { - // Nothing to escape, ensure search_flush doesn't do anything by setting - // search->cursor to search->ptr. - search->buffer->len += remaining; - search->ptr = search->end; - search->cursor = search->end; - return 0; - } - - if (popcnt >= sizeof(uint8x16_t)/2) { - return remaining; - } - - search->matches_mask = neon_match_mask(needs_escape); - search->has_matches = 1; - search->chunk_base = search->ptr; - return neon_next_match(search); - } - - return 0; -} - -#else - static inline FORCE_INLINE uint8x16_t neon_rules_update(uint8x16_t chunk) { const uint8x16_t lower_bound = vdupq_n_u8(' '); @@ -406,8 +326,24 @@ static inline FORCE_INLINE uint8x16_t neon_rules_update(uint8x16_t chunk) return needs_escape; } -static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_rules(search_state *search) +static inline unsigned char search_escape_basic_neon(search_state *search) { + if (RB_UNLIKELY(search->has_matches)) { + // There are more matches if search->matches_mask > 0. + if (search->matches_mask > 0) { + return neon_next_match(search); + } else { + // neon_next_match will only advance search->ptr up to the last matching character. + // Skip over any characters in the last chunk that occur after the last match. + search->has_matches = 0; + if (RB_UNLIKELY(search->chunk_base+sizeof(uint8x16_t) >= search->end)) { + search->ptr = search->end; + } else { + search->ptr = search->chunk_base+sizeof(uint8x16_t); + } + } + } + /* * The code below implements an SIMD-based algorithm to determine if N bytes at a time * need to be escaped. @@ -441,15 +377,12 @@ static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_rules( * This is the needs_escape vector and it is equal to: * [FF 0 0 FF 0 0 0 0] * - * For ARM Neon specifically, we check if the maximum number in the vector is 0. The maximum of - * the needs_escape vector is FF. Therefore, we know there is at least one byte that needs to be - * escaped. - * - * If the maximum of the needs_escape vector is 0, none of the bytes need to be escaped and - * we advance pos by the width of the vector. + * Next we compute the bitwise AND between each byte and 0x1 and compute the horizontal sum of + * the values in the vector. This computes how many bytes need to be escaped within this chunk. * - * To determine how to escape characters, we look at each value in the needs_escape vector and take - * the appropriate action. + * If the sum is zero, no bytes need to be escaped and we can skip 16 bytes. + * + * If the sum is greater than or equal to 8, then we can assume that at least half of the bytes in chunk. */ while (search->ptr+sizeof(uint8x16_t) <= search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); @@ -470,7 +403,7 @@ static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_rules( search->chunk_base = search->ptr; return neon_next_match(search); } - + // There are fewer than 16 bytes left. unsigned long remaining = (search->end - search->ptr); if (remaining >= SIMD_MINIMUM_THRESHOLD) { @@ -499,39 +432,6 @@ static inline FORCE_INLINE unsigned char search_escape_basic_neon_advance_rules( return neon_next_match(search); } - return 0; -} -#endif /* USE_NEON_LUT */ - -static inline unsigned char search_escape_basic_neon(search_state *search) -{ - if (RB_UNLIKELY(search->has_matches)) { - // There are more matches if search->matches_mask > 0. - if (search->matches_mask > 0) { - return neon_next_match(search); - } else { - // neon_next_match will only advance search->ptr up to the last matching character. - // Skip over any characters in the last chunk that occur after the last match. - search->has_matches = 0; - if (RB_UNLIKELY(search->chunk_base+sizeof(uint8x16_t) >= search->end)) { - search->ptr = search->end; - } else { - search->ptr = search->chunk_base+sizeof(uint8x16_t); - } - } - } - -#ifdef USE_NEON_LUT - unsigned char num_chars = 0; - if ((num_chars = search_escape_basic_neon_advance_lut(search))) { - return num_chars; - } -#else - unsigned char num_chars = 0; - if ((num_chars = search_escape_basic_neon_advance_rules(search))) { - return num_chars; - } -#endif /* USE_NEON_LUT */ if (search->ptr < search->end) { return search_escape_basic(search); } @@ -1625,29 +1525,6 @@ static VALUE generate_json_rescue(VALUE d, VALUE exc) return Qundef; } -/* SIMD Utilities (if enabled) */ -#ifdef ENABLE_SIMD -#ifdef HAVE_SIMD_NEON -#ifdef USE_NEON_LUT -static void initialize_simd_neon(void) { - simd_state.neon.escape_table_basic[0] = load_uint8x16_4(escape_table_basic); - simd_state.neon.escape_table_basic[1] = load_uint8x16_4(escape_table_basic+64); - - simd_state.neon.escape_table_basic[0].val[0] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[0], vdupq_n_u8(9)); - simd_state.neon.escape_table_basic[0].val[1] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[1], vdupq_n_u8(9)); - simd_state.neon.escape_table_basic[0].val[2] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[2], vdupq_n_u8(9)); - simd_state.neon.escape_table_basic[0].val[3] = vceqq_u8(simd_state.neon.escape_table_basic[0].val[3], vdupq_n_u8(9)); - - simd_state.neon.escape_table_basic[1].val[0] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[0], vdupq_n_u8(9)); - simd_state.neon.escape_table_basic[1].val[1] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[1], vdupq_n_u8(9)); - simd_state.neon.escape_table_basic[1].val[2] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[2], vdupq_n_u8(9)); - simd_state.neon.escape_table_basic[1].val[3] = vceqq_u8(simd_state.neon.escape_table_basic[1].val[3], vdupq_n_u8(9)); -} -#endif /* USE_NEON_LUT */ -#endif /* HAVE_NEON_SIMD */ - -#endif - static VALUE cState_partial_generate(VALUE self, VALUE obj, generator_func func, VALUE io) { GET_STATE(self); @@ -2310,10 +2187,6 @@ void Init_generator(void) #ifdef ENABLE_SIMD #ifdef HAVE_SIMD_NEON case SIMD_NEON: - /* Initialize ARM Neon SIMD Implementation. */ -#ifdef USE_NEON_LUT - initialize_simd_neon(); -#endif /* USE_NEON_LUT */ search_escape_basic_impl = search_escape_basic_neon; break; #endif /* HAVE_SIMD_NEON */ From e8df77a76e3502031453db8710e4d678c37daa17 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 23 Apr 2025 20:20:12 -0500 Subject: [PATCH 33/51] Fix a compiler warning on gcc. --- ext/json/ext/generator/simd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index ca3e40bc3..d59adec33 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -68,7 +68,7 @@ uint32_t popcount32(uint32_t x) { #include #define FIND_SIMD_IMPLEMENTATION_DEFINED 1 -static SIMD_Implementation find_simd_implementation() { +static SIMD_Implementation find_simd_implementation(void) { return SIMD_NEON; } From 91769511b5c7d252eb5ab13a13f1257b7775abbe Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Thu, 24 Apr 2025 06:51:24 -0500 Subject: [PATCH 34/51] Remove the print_* functions. --- ext/json/ext/generator/simd.h | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index d59adec33..0472c033c 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -83,16 +83,6 @@ uint8x16x4_t load_uint8x16_4(const unsigned char *table) { return tab; } -void print_uint8x16(char *msg, uint8x16_t vec) { - printf("%s\n[ ", msg); - uint8_t store[16] = {0}; - vst1q_u8(store, vec); - for(int i=0; i<16; i++) { - printf("%3d ", store[i]); - } - printf("]\n"); -} - #endif /* ARM Neon Support.*/ #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) @@ -102,17 +92,6 @@ void print_uint8x16(char *msg, uint8x16_t vec) { #define HAVE_SIMD_SSE2 1 -void print_m128i(const char *prefix, __m128i vec) { - uint8_t r[16]; - _mm_storeu_si128((__m128i *) r, vec); - - printf("%s = [ ", prefix); - for(int i=0; i<16; i++) { - printf("%02x ", r[i]); - } - printf("]\n"); -} - #ifdef HAVE_CPUID_H #define FIND_SIMD_IMPLEMENTATION_DEFINED 1 From 543db7b678c43f9b0f76bfdce26cd46dc9529e15 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Thu, 24 Apr 2025 21:41:27 -0500 Subject: [PATCH 35/51] Added a few tests. --- test/json/json_generator_test.rb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index b4abcc477..f86ce9aeb 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -489,6 +489,12 @@ def test_backslash data = 'This is a test of the emergency broadcast\tsystem.\n' json = "\"This is a test of the emergency broadcast\\\\tsystem.\\\\n\"" assert_equal json, generate(data) + data = '"' * 15 + json = "\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\"" + assert_equal json, generate(data) + data = "\"\"\"\"\"\"\"\"\"\"\"\"\"\"a" + json = "\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"a\"" + assert_equal json, generate(data) end def test_string_subclass From c47751eb519ddd402094b853478cd8b589db2f0d Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Thu, 24 Apr 2025 21:43:54 -0500 Subject: [PATCH 36/51] Fixed an issue where the code was escaping characters that didn't need to be escaped. --- ext/json/ext/generator/generator.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index f94ebd7de..03ec66957 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -218,11 +218,8 @@ static inline FORCE_INLINE void escape_UTF8_char_basic(search_state *search) */ static inline void convert_UTF8_to_JSON(search_state *search) { - unsigned char num_chars = 0; - while ((num_chars = search_escape_basic_impl(search))) { - do { - escape_UTF8_char_basic(search); - } while (--num_chars); + while (search_escape_basic_impl(search)) { + escape_UTF8_char_basic(search); } } @@ -393,10 +390,6 @@ static inline unsigned char search_escape_basic_neon(search_state *search) search->ptr += sizeof(uint8x16_t); continue; } - - if (popcnt >= sizeof(uint8x16_t)/2) { - return sizeof(uint8x16_t); - } search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; @@ -422,10 +415,6 @@ static inline unsigned char search_escape_basic_neon(search_state *search) return 0; } - if (popcnt >= sizeof(uint8x16_t)/2) { - return remaining; - } - search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; search->chunk_base = search->ptr; @@ -514,10 +503,6 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se continue; } - if (popcount32(needs_escape_mask) >= sizeof(__m128i)/2) { - return sizeof(__m128i); - } - search->has_matches = 1; search->matches_mask = needs_escape_mask; search->chunk_base = search->ptr; From 7b802e9b30162d6a09857fb8635a7c9b4c797d78 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Fri, 25 Apr 2025 08:05:27 -0500 Subject: [PATCH 37/51] A different fix for handling characters that do not need to be escaped. --- ext/json/ext/generator/generator.c | 33 +++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 03ec66957..41901116a 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -188,11 +188,15 @@ static inline FORCE_INLINE void escape_UTF8_char_basic(search_state *search) case '\r': fbuffer_append(search->buffer, "\\r", 2); break; case '\t': fbuffer_append(search->buffer, "\\t", 2); break; default: { - const char *hexdig = "0123456789abcdef"; - char scratch[6] = { '\\', 'u', '0', '0', 0, 0 }; - scratch[4] = hexdig[(ch >> 4) & 0xf]; - scratch[5] = hexdig[ch & 0xf]; - fbuffer_append(search->buffer, scratch, 6); + if (ch < 0x32) { + const char *hexdig = "0123456789abcdef"; + char scratch[6] = { '\\', 'u', '0', '0', 0, 0 }; + scratch[4] = hexdig[(ch >> 4) & 0xf]; + scratch[5] = hexdig[ch & 0xf]; + fbuffer_append(search->buffer, scratch, 6); + } else { + fbuffer_append_char(search->buffer, ch); + } break; } } @@ -218,8 +222,11 @@ static inline FORCE_INLINE void escape_UTF8_char_basic(search_state *search) */ static inline void convert_UTF8_to_JSON(search_state *search) { - while (search_escape_basic_impl(search)) { - escape_UTF8_char_basic(search); + unsigned char num_chars = 0; + while ((num_chars = search_escape_basic_impl(search))) { + do { + escape_UTF8_char_basic(search); + } while (--num_chars); } } @@ -390,6 +397,10 @@ static inline unsigned char search_escape_basic_neon(search_state *search) search->ptr += sizeof(uint8x16_t); continue; } + + if (popcnt >= sizeof(uint8x16_t)/2) { + return sizeof(uint8x16_t); + } search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; @@ -415,6 +426,10 @@ static inline unsigned char search_escape_basic_neon(search_state *search) return 0; } + if (popcnt >= sizeof(uint8x16_t)/2) { + return remaining; + } + search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; search->chunk_base = search->ptr; @@ -503,6 +518,10 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se continue; } + if (popcount32(needs_escape_mask) >= sizeof(__m128i)/2) { + return sizeof(__m128i); + } + search->has_matches = 1; search->matches_mask = needs_escape_mask; search->chunk_base = search->ptr; From 0951730d093b918f129fd94cbae4d3809c38c970 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Fri, 25 Apr 2025 21:09:17 -0500 Subject: [PATCH 38/51] Added tests of various lengths to ensure the SIMD escaping code works as expected. --- test/json/json_generator_test.rb | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index f86ce9aeb..0eb31828b 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -495,6 +495,42 @@ def test_backslash data = "\"\"\"\"\"\"\"\"\"\"\"\"\"\"a" json = "\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"a\"" assert_equal json, generate(data) + data = "\u0001\u0001\u0001\u0001" + json = "\"\\u0001\\u0001\\u0001\\u0001\"" + assert_equal json, generate(data) + data = "\u0001a\u0001a\u0001a\u0001a" + json = "\"\\u0001a\\u0001a\\u0001a\\u0001a\"" + assert_equal json, generate(data) + data = "\u0001aa\u0001aa" + json = "\"\\u0001aa\\u0001aa\"" + assert_equal json, generate(data) + data = "\u0001aa\u0001aa\u0001aa" + json = "\"\\u0001aa\\u0001aa\\u0001aa\"" + assert_equal json, generate(data) + data = "\u0001aa\u0001aa\u0001aa\u0001aa\u0001aa\u0001aa" + json = "\"\\u0001aa\\u0001aa\\u0001aa\\u0001aa\\u0001aa\\u0001aa\"" + assert_equal json, generate(data) + data = "\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002" + json = "\"\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\"" + assert_equal json, generate(data) + data = "ab\u0002c" + json = "\"ab\\u0002c\"" + assert_equal json, generate(data) + data = "ab\u0002cab\u0002cab\u0002cab\u0002c" + json = "\"ab\\u0002cab\\u0002cab\\u0002cab\\u0002c\"" + assert_equal json, generate(data) + data = "ab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002c" + json = "\"ab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002c\"" + assert_equal json, generate(data) + data = "\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f" + json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\"" + assert_equal json, generate(data) + data = "\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b" + json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\"" + assert_equal json, generate(data) + data = "a\n\t\f\b\n\t\f\b\n\t\f\b\n\t" + json = "\"a\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\"" + assert_equal json, generate(data) end def test_string_subclass From b689be94cdcf5fa73dcb2911b378c0e26c3e5ef7 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Fri, 25 Apr 2025 21:51:36 -0500 Subject: [PATCH 39/51] Small bugfix. --- ext/json/ext/generator/generator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 41901116a..e0fac9422 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -188,7 +188,7 @@ static inline FORCE_INLINE void escape_UTF8_char_basic(search_state *search) case '\r': fbuffer_append(search->buffer, "\\r", 2); break; case '\t': fbuffer_append(search->buffer, "\\t", 2); break; default: { - if (ch < 0x32) { + if (ch < ' ') { const char *hexdig = "0123456789abcdef"; char scratch[6] = { '\\', 'u', '0', '0', 0, 0 }; scratch[4] = hexdig[(ch >> 4) & 0xf]; From a8f3a0a9043dc09998211767b409dc6d1abfa187 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 27 Apr 2025 08:43:16 +0200 Subject: [PATCH 40/51] Small comment typo --- ext/json/ext/generator/generator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index e0fac9422..a795dc3c4 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -134,7 +134,7 @@ typedef struct _search_state { static inline FORCE_INLINE void search_flush(search_state *search) { - // Do not remove this conditional without profiling, specfically escape-heavy text. + // Do not remove this conditional without profiling, specifically escape-heavy text. // escape_UTF8_char_basic will advance search->ptr and search->cursor (effectively a search_flush). // For back-to-back characters that need to be escaped, specifcally for the SIMD code paths, this method // will be called just before calling escape_UTF8_char_basic. There will be no characers to append for the From 483286065b69897d14d5067f9cda0e759feea68d Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 27 Apr 2025 09:08:50 +0200 Subject: [PATCH 41/51] style --- ext/json/ext/generator/generator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index a795dc3c4..1fbc614d2 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -340,10 +340,10 @@ static inline unsigned char search_escape_basic_neon(search_state *search) // neon_next_match will only advance search->ptr up to the last matching character. // Skip over any characters in the last chunk that occur after the last match. search->has_matches = 0; - if (RB_UNLIKELY(search->chunk_base+sizeof(uint8x16_t) >= search->end)) { + if (RB_UNLIKELY(search->chunk_base + sizeof(uint8x16_t) >= search->end)) { search->ptr = search->end; } else { - search->ptr = search->chunk_base+sizeof(uint8x16_t); + search->ptr = search->chunk_base + sizeof(uint8x16_t); } } } From 7f1b95a19b9a6524dd84775e552cfa27eb0d804b Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 27 Apr 2025 10:17:04 +0200 Subject: [PATCH 42/51] doc --- ext/json/ext/generator/generator.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 1fbc614d2..2180b3d73 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -279,6 +279,8 @@ static inline FORCE_INLINE char *copy_remaining_bytes(search_state *search, unsi char *s = (buf->ptr + buf->len); + // Pad the buffer with dummy characters that won't need escaping. + // This seem wateful at first sight, but memset of vector length is very fast. memset(s, 'X', vec_len); // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters From 28b73a9d3ae0392170a0fdc94bb71e6c0f3020a7 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 27 Apr 2025 10:20:18 +0200 Subject: [PATCH 43/51] style and typos --- ext/json/ext/generator/generator.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 2180b3d73..d88a82665 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -356,7 +356,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) * * Assume the ptr = "Te\sting!" (the double quotes are included in the string) * - * The explanination will be limited to the first 8 bytes of the string for simplicity. However + * The explanation will be limited to the first 8 bytes of the string for simplicity. However * the vector insructions may work on larger vectors. * * First, we load three constants 'lower_bound', 'backslash' and 'dblquote" in vector registers. @@ -390,7 +390,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) * * If the sum is greater than or equal to 8, then we can assume that at least half of the bytes in chunk. */ - while (search->ptr+sizeof(uint8x16_t) <= search->end) { + while (search->ptr + sizeof(uint8x16_t) <= search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); uint8x16_t needs_escape = neon_rules_update(chunk); uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); @@ -501,15 +501,15 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se // sse2_next_match will only advance search->ptr up to the last matching character. // Skip over any characters in the last chunk that occur after the last match. search->has_matches = 0; - if (RB_UNLIKELY(search->chunk_base+sizeof(__m128i) >= search->end)) { + if (RB_UNLIKELY(search->chunk_base + sizeof(__m128i) >= search->end)) { search->ptr = search->end; } else { - search->ptr = search->chunk_base+sizeof(__m128i); + search->ptr = search->chunk_base + sizeof(__m128i); } } } - while (search->ptr+sizeof(__m128i) <= search->end) { + while (search->ptr + sizeof(__m128i) <= search->end) { __m128i chunk = _mm_loadu_si128((__m128i const*)search->ptr); __m128i needs_escape = sse2_update(chunk); From 0e338143f89a40c05179501a06a63ac9bc54b5a1 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 27 Apr 2025 10:23:07 +0200 Subject: [PATCH 44/51] Factorize more --- ext/json/ext/generator/generator.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index d88a82665..be3e7cd70 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -318,8 +318,10 @@ static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches) return mask & 0x8888888888888888ull; } -static inline FORCE_INLINE uint8x16_t neon_rules_update(uint8x16_t chunk) +static inline FORCE_INLINE uint8x16_t neon_rules_update(const char *ptr) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr); + const uint8x16_t lower_bound = vdupq_n_u8(' '); const uint8x16_t backslash = vdupq_n_u8('\\'); const uint8x16_t dblquote = vdupq_n_u8('\"'); @@ -391,8 +393,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) * If the sum is greater than or equal to 8, then we can assume that at least half of the bytes in chunk. */ while (search->ptr + sizeof(uint8x16_t) <= search->end) { - uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); - uint8x16_t needs_escape = neon_rules_update(chunk); + uint8x16_t needs_escape = neon_rules_update(search->ptr); uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); if (popcnt == 0) { @@ -415,8 +416,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) if (remaining >= SIMD_MINIMUM_THRESHOLD) { char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); - uint8x16_t chunk = vld1q_u8((const unsigned char *) s); - uint8x16_t needs_escape = neon_rules_update(chunk); + uint8x16_t needs_escape = neon_rules_update(s); uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); if (popcnt == 0) { From af859c21912d0608dfaeb10293bcff3a4b0d143b Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 27 Apr 2025 10:32:12 +0200 Subject: [PATCH 45/51] Skip the popcount step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my machine is perform as well if not sligthly better, less code and less conditional seem attractive. ``` == Encoding activitypub.json (52595 bytes) ruby 3.4.2 (2025-02-15 revision d2930f8e7a) +YJIT +PRISM [arm64-darwin24] Warming up -------------------------------------- after 3.026k i/100ms Calculating ------------------------------------- after 30.476k (± 1.6%) i/s (32.81 μs/i) - 154.326k in 5.065214s Comparison: before: 29732.0 i/s after: 30476.0 i/s - 1.03x faster == Encoding citm_catalog.json (500298 bytes) ruby 3.4.2 (2025-02-15 revision d2930f8e7a) +YJIT +PRISM [arm64-darwin24] Warming up -------------------------------------- after 153.000 i/100ms Calculating ------------------------------------- after 1.547k (± 0.7%) i/s (646.38 μs/i) - 7.803k in 5.043956s Comparison: before: 1561.9 i/s after: 1547.1 i/s - same-ish: difference falls within error == Encoding twitter.json (466906 bytes) ruby 3.4.2 (2025-02-15 revision d2930f8e7a) +YJIT +PRISM [arm64-darwin24] Warming up -------------------------------------- after 309.000 i/100ms Calculating ------------------------------------- after 3.108k (± 1.1%) i/s (321.74 μs/i) - 15.759k in 5.070855s Comparison: before: 3012.7 i/s after: 3108.1 i/s - 1.03x faster ``` --- ext/json/ext/generator/generator.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index be3e7cd70..0e5ce5402 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -331,6 +331,8 @@ static inline FORCE_INLINE uint8x16_t neon_rules_update(const char *ptr) uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); + vandq_u8(needs_escape, vdupq_n_u8(0x1)); + return needs_escape; } @@ -394,18 +396,13 @@ static inline unsigned char search_escape_basic_neon(search_state *search) */ while (search->ptr + sizeof(uint8x16_t) <= search->end) { uint8x16_t needs_escape = neon_rules_update(search->ptr); - uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); + uint64_t mask = neon_match_mask(needs_escape); - if (popcnt == 0) { + if (!mask) { search->ptr += sizeof(uint8x16_t); continue; } - - if (popcnt >= sizeof(uint8x16_t)/2) { - return sizeof(uint8x16_t); - } - - search->matches_mask = neon_match_mask(needs_escape); + search->matches_mask = mask; search->has_matches = 1; search->chunk_base = search->ptr; return neon_next_match(search); @@ -417,9 +414,9 @@ static inline unsigned char search_escape_basic_neon(search_state *search) char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); uint8x16_t needs_escape = neon_rules_update(s); - uint8_t popcnt = vaddvq_u8(vandq_u8(needs_escape, vdupq_n_u8(0x1))); + uint64_t mask = neon_match_mask(needs_escape); - if (popcnt == 0) { + if (!mask) { // Nothing to escape, ensure search_flush doesn't do anything by setting // search->cursor to search->ptr. search->buffer->len += remaining; @@ -428,10 +425,6 @@ static inline unsigned char search_escape_basic_neon(search_state *search) return 0; } - if (popcnt >= sizeof(uint8x16_t)/2) { - return remaining; - } - search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; search->chunk_base = search->ptr; From e3ba02af4926d8c8f278250ddbb1cc5dc4536c11 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 27 Apr 2025 12:01:33 +0200 Subject: [PATCH 46/51] Missing end line --- ext/json/ext/generator/simd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index 0472c033c..d2f235191 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -123,4 +123,4 @@ static SIMD_Implementation find_simd_implementation(void) { static SIMD_Implementation find_simd_implementation(void) { return SIMD_NONE; } -#endif \ No newline at end of file +#endif From c999baf5bb492da603e206b39b760eaf55aa98b0 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 27 Apr 2025 12:08:29 +0200 Subject: [PATCH 47/51] Add a CI step with simd disabled --- .github/workflows/ci.yml | 31 +++++++++++++++++-------------- ext/json/ext/generator/extconf.rb | 4 ++-- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b9574d30c..b6f82ae90 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,26 +14,29 @@ jobs: host: needs: ruby-versions - name: ${{ matrix.os }} ${{ matrix.ruby }} + name: ${{ matrix.os }} ${{ matrix.ruby }} ${{ matrix.env }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: - - ubuntu-latest - - macos-14 - - windows-latest + - ubuntu-latest + - macos-14 + - windows-latest ruby: ${{ fromJson(needs.ruby-versions.outputs.versions) }} + env: + - "" include: - - { os: ubuntu-24.04-arm, ruby: 3.4 } - - { os: macos-13, ruby: 3.4 } - - { os: windows-latest , ruby: mswin } # ruby/ruby windows CI - - { os: ubuntu-latest , ruby: jruby-9.4 } # Ruby 3.1 - - { os: macos-latest , ruby: truffleruby-head } - - { os: ubuntu-latest , ruby: truffleruby-head } + - { os: ubuntu-24.04-arm, ruby: 3.4 } + - { os: ubuntu-latest , ruby: 3.4, env: "JSON_DISABLE_SIMD=1" } + - { os: macos-13, ruby: 3.4 } + - { os: windows-latest , ruby: mswin } # ruby/ruby windows CI + - { os: ubuntu-latest , ruby: jruby-9.4 } # Ruby 3.1 + - { os: macos-latest , ruby: truffleruby-head } + - { os: ubuntu-latest , ruby: truffleruby-head } exclude: - - { os: windows-latest, ruby: jruby } - - { os: windows-latest, ruby: jruby-head } + - { os: windows-latest, ruby: jruby } + - { os: windows-latest, ruby: jruby-head } steps: - uses: actions/checkout@v4 @@ -49,9 +52,9 @@ jobs: bundle config --without benchmark bundle install - - run: rake compile + - run: rake compile ${{ matrix.env }} - - run: rake test JSON_COMPACT=1 + - run: rake test JSON_COMPACT=1 ${{ matrix.env }} - run: rake build diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 2ebac48b1..4fbeb5f33 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -7,7 +7,7 @@ append_cflags("-std=c99") $defs << "-DJSON_GENERATOR" - if enable_config('generator-use-simd', default=true) + if enable_config('generator-use-simd', default=!ENV["JSON_DISABLE_SIMD"]) if RbConfig::CONFIG['host_cpu'] =~ /^(arm.*|aarch64.*)/ # Try to compile a small program using NEON instructions if have_header('arm_neon.h') @@ -31,7 +31,7 @@ SRC $defs.push("-DENABLE_SIMD") end - + have_header('cpuid.h') end From 5de293a69ca9ca2cfa6c104c830444b93313d041 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 27 Apr 2025 12:50:14 -0500 Subject: [PATCH 48/51] Removed unnecessary code and fixed a comment. --- ext/json/ext/generator/generator.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 0e5ce5402..aa268b2e1 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -331,8 +331,6 @@ static inline FORCE_INLINE uint8x16_t neon_rules_update(const char *ptr) uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); - vandq_u8(needs_escape, vdupq_n_u8(0x1)); - return needs_escape; } @@ -390,9 +388,9 @@ static inline unsigned char search_escape_basic_neon(search_state *search) * Next we compute the bitwise AND between each byte and 0x1 and compute the horizontal sum of * the values in the vector. This computes how many bytes need to be escaped within this chunk. * - * If the sum is zero, no bytes need to be escaped and we can skip 16 bytes. - * - * If the sum is greater than or equal to 8, then we can assume that at least half of the bytes in chunk. + * Finally we compute a mask that indicates which bytes need to be escaped. If the mask is 0 then, + * no bytes need to be escaped and we can continue to the next chunk. If the mask is not 0 then we + * have at least one byte that needs to be escaped. */ while (search->ptr + sizeof(uint8x16_t) <= search->end) { uint8x16_t needs_escape = neon_rules_update(search->ptr); From 56c34a4eaf1edea469cbea29f9078bb4477fef1a Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 27 Apr 2025 20:25:28 -0500 Subject: [PATCH 49/51] Simplify updatig search->ptr when there are no more matches in a chunk. --- ext/json/ext/generator/generator.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index aa268b2e1..89153ede2 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -114,6 +114,7 @@ typedef struct _search_state { #ifdef ENABLE_SIMD const char *chunk_base; + const char *chunk_end; uint8_t has_matches; #ifdef HAVE_SIMD_NEON @@ -344,11 +345,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) // neon_next_match will only advance search->ptr up to the last matching character. // Skip over any characters in the last chunk that occur after the last match. search->has_matches = 0; - if (RB_UNLIKELY(search->chunk_base + sizeof(uint8x16_t) >= search->end)) { - search->ptr = search->end; - } else { - search->ptr = search->chunk_base + sizeof(uint8x16_t); - } + search->ptr = search->chunk_end; } } @@ -403,6 +400,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) search->matches_mask = mask; search->has_matches = 1; search->chunk_base = search->ptr; + search->chunk_end = search->ptr + sizeof(uint8x16_t); return neon_next_match(search); } @@ -425,6 +423,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) search->matches_mask = neon_match_mask(needs_escape); search->has_matches = 1; + search->chunk_end = search->end; search->chunk_base = search->ptr; return neon_next_match(search); } From e50b5df890a7b28264027c1103414bdbe0b0c48d Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Mon, 28 Apr 2025 08:17:54 +0200 Subject: [PATCH 50/51] Dont do popcount in sse2 path either --- ext/json/ext/generator/generator.c | 31 ++++++++++-------------------- ext/json/ext/generator/simd.h | 14 -------------- 2 files changed, 10 insertions(+), 35 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 89153ede2..9c49eecde 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -115,7 +115,7 @@ typedef struct _search_state { #ifdef ENABLE_SIMD const char *chunk_base; const char *chunk_end; - uint8_t has_matches; + bool has_matches; #ifdef HAVE_SIMD_NEON uint64_t matches_mask; @@ -223,11 +223,8 @@ static inline FORCE_INLINE void escape_UTF8_char_basic(search_state *search) */ static inline void convert_UTF8_to_JSON(search_state *search) { - unsigned char num_chars = 0; - while ((num_chars = search_escape_basic_impl(search))) { - do { - escape_UTF8_char_basic(search); - } while (--num_chars); + while (search_escape_basic_impl(search)) { + escape_UTF8_char_basic(search); } } @@ -344,7 +341,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) } else { // neon_next_match will only advance search->ptr up to the last matching character. // Skip over any characters in the last chunk that occur after the last match. - search->has_matches = 0; + search->has_matches = false; search->ptr = search->chunk_end; } } @@ -398,7 +395,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) continue; } search->matches_mask = mask; - search->has_matches = 1; + search->has_matches = true; search->chunk_base = search->ptr; search->chunk_end = search->ptr + sizeof(uint8x16_t); return neon_next_match(search); @@ -422,7 +419,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) } search->matches_mask = neon_match_mask(needs_escape); - search->has_matches = 1; + search->has_matches = true; search->chunk_end = search->end; search->chunk_base = search->ptr; return neon_next_match(search); @@ -490,7 +487,7 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se } else { // sse2_next_match will only advance search->ptr up to the last matching character. // Skip over any characters in the last chunk that occur after the last match. - search->has_matches = 0; + search->has_matches = false; if (RB_UNLIKELY(search->chunk_base + sizeof(__m128i) >= search->end)) { search->ptr = search->end; } else { @@ -510,11 +507,7 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se continue; } - if (popcount32(needs_escape_mask) >= sizeof(__m128i)/2) { - return sizeof(__m128i); - } - - search->has_matches = 1; + search->has_matches = true; search->matches_mask = needs_escape_mask; search->chunk_base = search->ptr; return sse2_next_match(search); @@ -539,11 +532,7 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se return 0; } - if (popcount32(needs_escape_mask) >= sizeof(__m128i)/2) { - return remaining; - } - - search->has_matches = 1; + search->has_matches = true; search->matches_mask = needs_escape_mask; search->chunk_base = search->ptr; return sse2_next_match(search); @@ -1310,7 +1299,7 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat #ifdef ENABLE_SIMD search.matches_mask = 0; - search.has_matches = 0; + search.has_matches = false; search.chunk_base = NULL; #endif /* ENABLE_SIMD */ diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index d2f235191..4deb97b4d 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -48,20 +48,6 @@ static inline int trailing_zeros(int input) { #endif } -uint32_t popcount32(uint32_t x) { - #if defined(__GNUC__) || defined(__clang__) - return __builtin_popcount(x); - #elif defined(__ARM_NEON) - #include - return vaddv_u8(vcnt_u8(vcreate_u8((uint64_t)x))) & 0xFF; - #else - x = x - ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - x = (x + (x >> 4)) & 0x0F0F0F0F; - return (x * 0x01010101) >> 24; - #endif -} - #define SIMD_MINIMUM_THRESHOLD 6 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) From 85734e7fae7b54a691961b76ed14a48a6c53f8d3 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Mon, 28 Apr 2025 08:36:48 +0200 Subject: [PATCH 51/51] Simplify the SIMD interface further --- ext/json/ext/generator/generator.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 9c49eecde..f7a5a864e 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -316,7 +316,7 @@ static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches) return mask & 0x8888888888888888ull; } -static inline FORCE_INLINE uint8x16_t neon_rules_update(const char *ptr) +static inline FORCE_INLINE uint64_t neon_rules_update(const char *ptr) { uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr); @@ -329,7 +329,7 @@ static inline FORCE_INLINE uint8x16_t neon_rules_update(const char *ptr) uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); - return needs_escape; + return neon_match_mask(needs_escape); } static inline unsigned char search_escape_basic_neon(search_state *search) @@ -387,8 +387,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) * have at least one byte that needs to be escaped. */ while (search->ptr + sizeof(uint8x16_t) <= search->end) { - uint8x16_t needs_escape = neon_rules_update(search->ptr); - uint64_t mask = neon_match_mask(needs_escape); + uint64_t mask = neon_rules_update(search->ptr); if (!mask) { search->ptr += sizeof(uint8x16_t); @@ -406,8 +405,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) if (remaining >= SIMD_MINIMUM_THRESHOLD) { char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); - uint8x16_t needs_escape = neon_rules_update(s); - uint64_t mask = neon_match_mask(needs_escape); + uint64_t mask = neon_rules_update(s); if (!mask) { // Nothing to escape, ensure search_flush doesn't do anything by setting @@ -418,7 +416,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) return 0; } - search->matches_mask = neon_match_mask(needs_escape); + search->matches_mask = mask; search->has_matches = true; search->chunk_end = search->end; search->chunk_base = search->ptr; @@ -465,8 +463,10 @@ static inline FORCE_INLINE unsigned char sse2_next_match(search_state *search) #define TARGET_SSE2 #endif -static inline TARGET_SSE2 FORCE_INLINE __m128i sse2_update(__m128i chunk) +static inline TARGET_SSE2 FORCE_INLINE int sse2_update(const char *ptr) { + __m128i chunk = _mm_loadu_si128((__m128i const*)ptr); + const __m128i lower_bound = _mm_set1_epi8(' '); const __m128i backslash = _mm_set1_epi8('\\'); const __m128i dblquote = _mm_set1_epi8('\"'); @@ -475,7 +475,7 @@ static inline TARGET_SSE2 FORCE_INLINE __m128i sse2_update(__m128i chunk) __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); - return needs_escape; + return _mm_movemask_epi8(needs_escape); } static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(search_state *search) @@ -497,10 +497,7 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se } while (search->ptr + sizeof(__m128i) <= search->end) { - __m128i chunk = _mm_loadu_si128((__m128i const*)search->ptr); - __m128i needs_escape = sse2_update(chunk); - - int needs_escape_mask = _mm_movemask_epi8(needs_escape); + int needs_escape_mask = sse2_update(search->ptr); if (needs_escape_mask == 0) { search->ptr += sizeof(__m128i); @@ -518,10 +515,7 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se if (remaining >= SIMD_MINIMUM_THRESHOLD) { char *s = copy_remaining_bytes(search, sizeof(__m128i), remaining); - __m128i chunk = _mm_loadu_si128((__m128i const *) s); - __m128i needs_escape = sse2_update(chunk); - - int needs_escape_mask = _mm_movemask_epi8(needs_escape); + int needs_escape_mask = sse2_update(s); if (needs_escape_mask == 0) { // Nothing to escape, ensure search_flush doesn't do anything by setting