From 8454149d034ab7e8d1c176bfb0cddd3842088981 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sat, 23 Aug 2025 19:57:14 +0200 Subject: [PATCH] JSON.generate: warn or raise on duplicated key Because both strings and symbols keys are serialized the same, it always has been possible to generate documents with duplicated keys: ```ruby >> puts JSON.generate({ foo: 1, "foo" => 2 }) {"foo":1,"foo":2} ``` This is pretty much always a mistake and can cause various issues because it's not guaranteed how various JSON parsers will handle this. Until now I didn't think it was possible to catch such case without tanking performance, hence why I only made the parser more strict. But I finally found a way to check for duplicated keys cheaply enough. --- CHANGES.md | 11 ++++ ext/json/ext/fbuffer/fbuffer.h | 8 +++ ext/json/ext/generator/generator.c | 73 ++++++++++++++++++++++++--- ext/json/ext/parser/parser.c | 2 +- java/src/json/ext/Generator.java | 66 ++++++++++++++++++++++++ java/src/json/ext/GeneratorState.java | 39 ++++++++++++++ lib/json/common.rb | 19 +++++++ lib/json/ext/generator/state.rb | 5 ++ lib/json/truffle_ruby/generator.rb | 37 +++++++++++++- test/json/json_generator_test.rb | 38 ++++++++++++++ 10 files changed, 289 insertions(+), 9 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 670b2d3ef..779dd3ef7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,17 @@ ### Unreleased +* Add new `allow_duplicate_key` generator options. By default a warning is now emitted when a duplicated key is encountered. + In `json 3.0` an error will be raised. + ```ruby + >> Warning[:deprecated] = true + >> puts JSON.generate({ foo: 1, "foo" => 2 }) + (irb):2: warning: detected duplicate key "foo" in {foo: 1, "foo" => 2}. + This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true` + {"foo":1,"foo":2} + >> JSON.generate({ foo: 1, "foo" => 2 }, allow_duplicate_key: false) + detected duplicate key "foo" in {foo: 1, "foo" => 2} (JSON::GeneratorError) + ``` * Fix `JSON.generate` `strict: true` mode to also restrict hash keys. * Fix `JSON::Coder` to also invoke block for hash keys that aren't strings nor symbols. diff --git a/ext/json/ext/fbuffer/fbuffer.h b/ext/json/ext/fbuffer/fbuffer.h index dc40dec7f..1f6461b0b 100644 --- a/ext/json/ext/fbuffer/fbuffer.h +++ b/ext/json/ext/fbuffer/fbuffer.h @@ -24,6 +24,14 @@ typedef unsigned char _Bool; #endif #endif +#ifndef NOINLINE +#if defined(__has_attribute) && __has_attribute(noinline) +#define NOINLINE() __attribute__((noinline)) +#else +#define NOINLINE() +#endif +#endif + #ifndef RB_UNLIKELY #define RB_UNLIKELY(expr) expr #endif diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 52dcd24f0..d810927c5 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -9,6 +9,12 @@ /* ruby api and some helpers */ +enum duplicate_key_action { + JSON_DEPRECATED = 0, + JSON_IGNORE, + JSON_RAISE, +}; + typedef struct JSON_Generator_StateStruct { VALUE indent; VALUE space; @@ -21,6 +27,8 @@ typedef struct JSON_Generator_StateStruct { long depth; long buffer_initial_length; + enum duplicate_key_action on_duplicate_key; + bool allow_nan; bool ascii_only; bool script_safe; @@ -34,7 +42,7 @@ typedef struct JSON_Generator_StateStruct { static VALUE mJSON, cState, cFragment, eGeneratorError, eNestingError, Encoding_UTF_8; static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend, i_encode; -static VALUE sym_indent, sym_space, sym_space_before, sym_object_nl, sym_array_nl, sym_max_nesting, sym_allow_nan, +static VALUE sym_indent, sym_space, sym_space_before, sym_object_nl, sym_array_nl, sym_max_nesting, sym_allow_nan, sym_allow_duplicate_key, sym_ascii_only, sym_depth, sym_buffer_initial_length, sym_script_safe, sym_escape_slash, sym_strict, sym_as_json; @@ -987,8 +995,11 @@ static inline VALUE vstate_get(struct generate_json_data *data) } struct hash_foreach_arg { + VALUE hash; struct generate_json_data *data; - int iter; + int first_key_type; + bool first; + bool mixed_keys_encountered; }; static VALUE @@ -1006,6 +1017,22 @@ convert_string_subclass(VALUE key) return key_to_s; } +NOINLINE() +static void +json_inspect_hash_with_mixed_keys(struct hash_foreach_arg *arg) +{ + if (arg->mixed_keys_encountered) { + return; + } + arg->mixed_keys_encountered = true; + + JSON_Generator_State *state = arg->data->state; + if (state->on_duplicate_key != JSON_IGNORE) { + VALUE do_raise = state->on_duplicate_key == JSON_RAISE ? Qtrue : Qfalse; + rb_funcall(mJSON, rb_intern("on_mixed_keys_hash"), 2, arg->hash, do_raise); + } +} + static int json_object_i(VALUE key, VALUE val, VALUE _arg) { @@ -1016,8 +1043,16 @@ json_object_i(VALUE key, VALUE val, VALUE _arg) JSON_Generator_State *state = data->state; long depth = state->depth; + int key_type = rb_type(key); + + if (arg->first) { + arg->first = false; + arg->first_key_type = key_type; + } + else { + fbuffer_append_char(buffer, ','); + } - if (arg->iter > 0) fbuffer_append_char(buffer, ','); if (RB_UNLIKELY(data->state->object_nl)) { fbuffer_append_str(buffer, data->state->object_nl); } @@ -1029,8 +1064,12 @@ json_object_i(VALUE key, VALUE val, VALUE _arg) bool as_json_called = false; start: - switch (rb_type(key)) { + switch (key_type) { case T_STRING: + if (RB_UNLIKELY(arg->first_key_type != T_STRING)) { + json_inspect_hash_with_mixed_keys(arg); + } + if (RB_LIKELY(RBASIC_CLASS(key) == rb_cString)) { key_to_s = key; } else { @@ -1038,12 +1077,17 @@ json_object_i(VALUE key, VALUE val, VALUE _arg) } break; case T_SYMBOL: + if (RB_UNLIKELY(arg->first_key_type != T_SYMBOL)) { + json_inspect_hash_with_mixed_keys(arg); + } + key_to_s = rb_sym2str(key); break; default: if (data->state->strict) { if (RTEST(data->state->as_json) && !as_json_called) { key = rb_proc_call_with_block(data->state->as_json, 1, &key, Qnil); + key_type = rb_type(key); as_json_called = true; goto start; } else { @@ -1064,7 +1108,6 @@ json_object_i(VALUE key, VALUE val, VALUE _arg) if (RB_UNLIKELY(state->space)) fbuffer_append_str(buffer, data->state->space); generate_json(buffer, data, val); - arg->iter++; return ST_CONTINUE; } @@ -1091,8 +1134,9 @@ static void generate_json_object(FBuffer *buffer, struct generate_json_data *dat fbuffer_append_char(buffer, '{'); struct hash_foreach_arg arg = { + .hash = obj, .data = data, - .iter = 0, + .first = true, }; rb_hash_foreach(obj, json_object_i, (VALUE)&arg); @@ -1794,6 +1838,19 @@ static VALUE cState_ascii_only_set(VALUE self, VALUE enable) return Qnil; } +static VALUE cState_allow_duplicate_key_p(VALUE self) +{ + GET_STATE(self); + switch (state->on_duplicate_key) { + case JSON_IGNORE: + return Qtrue; + case JSON_DEPRECATED: + return Qnil; + case JSON_RAISE: + return Qfalse; + } +} + /* * call-seq: depth * @@ -1883,6 +1940,7 @@ static int configure_state_i(VALUE key, VALUE val, VALUE _arg) else if (key == sym_script_safe) { state->script_safe = RTEST(val); } else if (key == sym_escape_slash) { state->script_safe = RTEST(val); } else if (key == sym_strict) { state->strict = RTEST(val); } + else if (key == sym_allow_duplicate_key) { state->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; } else if (key == sym_as_json) { VALUE proc = RTEST(val) ? rb_convert_type(val, T_DATA, "Proc", "to_proc") : Qfalse; state_write_value(data, &state->as_json, proc); @@ -2008,6 +2066,8 @@ void Init_generator(void) rb_define_method(cState, "generate", cState_generate, -1); rb_define_alias(cState, "generate_new", "generate"); // :nodoc: + rb_define_private_method(cState, "allow_duplicate_key?", cState_allow_duplicate_key_p, 0); + rb_define_singleton_method(cState, "generate", cState_m_generate, 3); VALUE mGeneratorMethods = rb_define_module_under(mGenerator, "GeneratorMethods"); @@ -2072,6 +2132,7 @@ void Init_generator(void) sym_escape_slash = ID2SYM(rb_intern("escape_slash")); sym_strict = ID2SYM(rb_intern("strict")); sym_as_json = ID2SYM(rb_intern("as_json")); + sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key")); usascii_encindex = rb_usascii_encindex(); utf8_encindex = rb_utf8_encindex(); diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 1e6ee753f..00b71eb61 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -1314,7 +1314,7 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data) else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); } else if (key == sym_freeze) { config->freeze = RTEST(val); } else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; } - else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; } + else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; } else if (key == sym_decimal_class) { if (RTEST(val)) { if (rb_respond_to(val, i_try_convert)) { diff --git a/java/src/json/ext/Generator.java b/java/src/json/ext/Generator.java index 31f175cd7..94436cc4a 100644 --- a/java/src/json/ext/Generator.java +++ b/java/src/json/ext/Generator.java @@ -488,6 +488,66 @@ void generate(ThreadContext context, final Session session, RubyHash object, fin } } + private static class HashKeyTracker { + enum KeyType { + UNKNOWN, + STRING, + SYMBOL, + }; + + private KeyType keyType; + private boolean done; + private RubyHash hash; + + private HashKeyTracker(RubyHash hash) { + this.hash = hash; + this.done = false; + this.keyType = KeyType.UNKNOWN; + } + + public void trackFirst(ThreadContext context, Session session, IRubyObject key) { + if (key instanceof RubyString) { + this.keyType = KeyType.STRING; + } else if (key.getType() == context.runtime.getSymbol()) { + this.keyType = KeyType.SYMBOL; + } else { + this.done = true; + report(context, session); + } + } + + public void track(ThreadContext context, Session session, IRubyObject key) { + if (!done) { + if (keyType == KeyType.STRING) { + if (!(key instanceof RubyString)) { + this.report(context, session); + } + } else { + if (!(key.getType() == context.runtime.getSymbol())) { + this.report(context, session); + } + } + } + } + + private void report(ThreadContext context, Session session) { + this.done = true; + + final RuntimeInfo info = session.getInfo(context); + final GeneratorState state = session.getState(context); + + if (!state.getAllowDuplicateKey()) { + if (state.getDeprecateDuplicateKey()) { + IRubyObject args[] = new IRubyObject[]{hash, context.getRuntime().getFalse()}; + info.jsonModule.get().callMethod(context, "on_mixed_keys_hash", args); + } else { + IRubyObject args[] = new IRubyObject[]{hash, context.getRuntime().getTrue()}; + info.jsonModule.get().callMethod(context, "on_mixed_keys_hash", args); + } + } + } + } + static void generateHash(ThreadContext context, Session session, RubyHash object, OutputStream buffer) throws IOException { final GeneratorState state = session.getState(context); final int depth = state.increaseDepth(context); @@ -508,7 +568,13 @@ static void generateHash(ThreadContext context, Session session, RubyHash object buffer.write(objectNLBytes); boolean firstPair = true; + HashKeyTracker tracker = new HashKeyTracker(object); for (RubyHash.RubyHashEntry entry : (Set) object.directEntrySet()) { + if (firstPair) { + tracker.trackFirst(context, session, (IRubyObject)entry.getKey()); + } else { + tracker.track(context, session, (IRubyObject)entry.getKey()); + } processEntry(context, session, buffer, entry, firstPair, objectNl, indent, spaceBefore, space); firstPair = false; } diff --git a/java/src/json/ext/GeneratorState.java b/java/src/json/ext/GeneratorState.java index 142d388dc..f9e8234e6 100644 --- a/java/src/json/ext/GeneratorState.java +++ b/java/src/json/ext/GeneratorState.java @@ -34,6 +34,9 @@ * @author mernen */ public class GeneratorState extends RubyObject { + private boolean allowDuplicateKey = false; + private boolean deprecateDuplicateKey = true; + /** * The indenting unit string. Will be repeated several times for larger * indenting levels. @@ -216,6 +219,10 @@ public IRubyObject initialize_copy(ThreadContext context, IRubyObject vOrig) { this.strict = orig.strict; this.bufferInitialLength = orig.bufferInitialLength; this.depth = orig.depth; + + this.allowDuplicateKey = orig.allowDuplicateKey; + this.deprecateDuplicateKey = orig.deprecateDuplicateKey; + return this; } @@ -479,6 +486,24 @@ private ByteList prepareByteList(ThreadContext context, IRubyObject value) { return str.getByteList().dup(); } + @JRubyMethod(name="allow_duplicate_key?", visibility=Visibility.PRIVATE) + public IRubyObject allow_duplicate_key_p(ThreadContext context) { + if (allowDuplicateKey) { + return context.runtime.getTrue(); + } else if (deprecateDuplicateKey) { + return context.runtime.getNil(); + } + return context.runtime.getFalse(); + } + + public boolean getAllowDuplicateKey() { + return allowDuplicateKey; + } + + public boolean getDeprecateDuplicateKey() { + return deprecateDuplicateKey; + } + /** * State#configure(opts) * @@ -520,6 +545,10 @@ public IRubyObject _configure(ThreadContext context, IRubyObject vOpts) { depth = opts.getInt("depth", 0); + if (opts.hasKey("allow_duplicate_key")) { + this.allowDuplicateKey = opts.getBool("allow_duplicate_key", false); + this.deprecateDuplicateKey = false; + } return this; } @@ -548,6 +577,16 @@ public RubyHash to_h(ThreadContext context) { result.op_aset(context, runtime.newSymbol("strict"), strict_get(context)); result.op_aset(context, runtime.newSymbol("depth"), depth_get(context)); result.op_aset(context, runtime.newSymbol("buffer_initial_length"), buffer_initial_length_get(context)); + + if (this.allowDuplicateKey) { + if (!this.deprecateDuplicateKey) { + result.op_aset(context, runtime.newSymbol("allow_duplicate_key"), runtime.getTrue()); + } + } + else { + result.op_aset(context, runtime.newSymbol("allow_duplicate_key"), runtime.getFalse()); + } + for (String name: getInstanceVariableNameList()) { result.op_aset(context, runtime.newSymbol(name.substring(1)), getInstanceVariables().getInstanceVariable(name)); } diff --git a/lib/json/common.rb b/lib/json/common.rb index e99d152a8..7222a9d2d 100644 --- a/lib/json/common.rb +++ b/lib/json/common.rb @@ -186,6 +186,25 @@ def generator=(generator) # :nodoc: private + # Called from the extension when a hash has both string and symbol keys + def on_mixed_keys_hash(hash, do_raise) + set = {} + hash.each_key do |key| + key_str = key.to_s + + if set[key_str] + message = "detected duplicate key #{key_str.inspect} in #{hash.inspect}" + if do_raise + raise GeneratorError, message + else + deprecation_warning("#{message}.\nThis will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`") + end + else + set[key_str] = true + end + end + end + def deprecated_singleton_attr_accessor(*attrs) args = RUBY_VERSION >= "3.0" ? ", category: :deprecated" : "" attrs.each do |attr| diff --git a/lib/json/ext/generator/state.rb b/lib/json/ext/generator/state.rb index d40c3b5ec..d2af1631a 100644 --- a/lib/json/ext/generator/state.rb +++ b/lib/json/ext/generator/state.rb @@ -68,6 +68,11 @@ def to_h buffer_initial_length: buffer_initial_length, } + allow_duplicate_key = allow_duplicate_key? + unless allow_duplicate_key.nil? + result[:allow_duplicate_key] = allow_duplicate_key + end + instance_variables.each do |iv| iv = iv.to_s[1..-1] result[iv.to_sym] = self[iv] diff --git a/lib/json/truffle_ruby/generator.rb b/lib/json/truffle_ruby/generator.rb index 6facf0364..8be312f7a 100644 --- a/lib/json/truffle_ruby/generator.rb +++ b/lib/json/truffle_ruby/generator.rb @@ -271,6 +271,12 @@ def configure(opts) false end + if opts.key?(:allow_duplicate_key) + @allow_duplicate_key = !!opts[:allow_duplicate_key] + else + @allow_duplicate_key = nil # nil is deprecation + end + @strict = !!opts[:strict] if opts.key?(:strict) if !opts.key?(:max_nesting) # defaults to 100 @@ -284,6 +290,10 @@ def configure(opts) end alias merge configure + def allow_duplicate_key? # :nodoc: + @allow_duplicate_key + end + # Returns the configuration instance variables as a hash, that can be # passed to the configure method. def to_h @@ -292,6 +302,11 @@ def to_h iv = iv.to_s[1..-1] result[iv.to_sym] = self[iv] end + + if result[:allow_duplicate_key].nil? + result.delete(:allow_duplicate_key) + end + result end @@ -330,8 +345,17 @@ def generate_new(obj, anIO = nil) # :nodoc: when Hash buf << '{' first = true + key_type = nil obj.each_pair do |k,v| - buf << ',' unless first + if first + key_type = k.class + else + if key_type && !@allow_duplicate_key && key_type != k.class + key_type = nil # stop checking + JSON.send(:on_mixed_keys_hash, obj, !@allow_duplicate_key.nil?) + end + buf << ',' + end key_str = k.to_s if key_str.class == String @@ -471,9 +495,18 @@ def json_transform(state) delim = ",#{state.object_nl}" result = +"{#{state.object_nl}" first = true + key_type = nil indent = !state.object_nl.empty? each { |key, value| - result << delim unless first + if first + key_type = key.class + else + if key_type && !state.allow_duplicate_key? && key_type != key.class + key_type = nil # stop checking + JSON.send(:on_mixed_keys_hash, self, state.allow_duplicate_key? == false) + end + result << delim + end result << state.indent * depth if indent if state.strict? && !(Symbol === key || String === key) diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index 963350ea4..4315d109d 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -234,6 +234,24 @@ def test_state_defaults :space => "", :space_before => "", }.sort_by { |n,| n.to_s }, state.to_h.sort_by { |n,| n.to_s }) + + state = JSON::State.new(allow_duplicate_key: true) + assert_equal({ + :allow_duplicate_key => true, + :allow_nan => false, + :array_nl => "", + :as_json => false, + :ascii_only => false, + :buffer_initial_length => 1024, + :depth => 0, + :script_safe => false, + :strict => false, + :indent => "", + :max_nesting => 100, + :object_nl => "", + :space => "", + :space_before => "", + }.sort_by { |n,| n.to_s }, state.to_h.sort_by { |n,| n.to_s }) end def test_allow_nan @@ -828,4 +846,24 @@ def test_numbers_of_various_sizes assert_equal "[#{number}]", JSON.generate([number]) end end + + def test_generate_duplicate_keys_allowed + hash = { foo: 1, "foo" => 2 } + assert_equal %({"foo":1,"foo":2}), JSON.generate(hash, allow_duplicate_key: true) + end + + def test_generate_duplicate_keys_deprecated + hash = { foo: 1, "foo" => 2 } + assert_deprecated_warning(/allow_duplicate_key/) do + assert_equal %({"foo":1,"foo":2}), JSON.generate(hash) + end + end + + def test_generate_duplicate_keys_disallowed + hash = { foo: 1, "foo" => 2 } + error = assert_raise JSON::GeneratorError do + JSON.generate(hash, allow_duplicate_key: false) + end + assert_equal %(detected duplicate key "foo" in #{hash.inspect}), error.message + end end