From 5274d5d19f8ae20e1e31c4021a74c640badd3123 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Mon, 11 Aug 2025 22:02:12 -0500 Subject: [PATCH 1/8] Allow for segmented output streams and a SWAR-based basic StringEncoder implementation. --- Rakefile | 2 +- .../AbstractByteListDirectOutputStream.java | 31 +++++ .../json/ext/ByteListDirectOutputStream.java | 3 +- java/src/json/ext/Generator.java | 4 +- ...edSegmentedByteListDirectOutputStream.java | 106 +++++++++++++++ .../SegmentedByteListDirectOutputStream.java | 79 ++++++++++++ java/src/json/ext/StringEncoder.java | 121 +++++++++++++++++- 7 files changed, 338 insertions(+), 8 deletions(-) create mode 100644 java/src/json/ext/AbstractByteListDirectOutputStream.java create mode 100644 java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java create mode 100644 java/src/json/ext/SegmentedByteListDirectOutputStream.java diff --git a/Rakefile b/Rakefile index 52e178ed1..616e7e98e 100644 --- a/Rakefile +++ b/Rakefile @@ -116,7 +116,7 @@ if defined?(RUBY_ENGINE) and RUBY_ENGINE == 'jruby' file JRUBY_GENERATOR_JAR => :compile do cd 'java/src' do generator_classes = FileList[ - "json/ext/ByteList*.class", + "json/ext/*ByteList*.class", "json/ext/OptionsReader*.class", "json/ext/Generator*.class", "json/ext/RuntimeInfo*.class", diff --git a/java/src/json/ext/AbstractByteListDirectOutputStream.java b/java/src/json/ext/AbstractByteListDirectOutputStream.java new file mode 100644 index 000000000..64ca29d35 --- /dev/null +++ b/java/src/json/ext/AbstractByteListDirectOutputStream.java @@ -0,0 +1,31 @@ +package json.ext; + +import java.io.OutputStream; + +import org.jcodings.Encoding; +import org.jruby.util.ByteList; + +abstract class AbstractByteListDirectOutputStream extends OutputStream { + + private static final String PROP_SEGMENTED_BUFFER = "json.useSegmentedOutputStream"; + private static final String PROP_SEGMENTED_BUFFER_DEFAULT = "true"; + + private static final boolean USE_SEGMENTED_BUFFER; + + static { + String 
useSegmentedOutputStream = System.getProperty(PROP_SEGMENTED_BUFFER, PROP_SEGMENTED_BUFFER_DEFAULT); + USE_SEGMENTED_BUFFER = Boolean.parseBoolean(useSegmentedOutputStream); + // XXX Is there a logger we can use here? + // System.out.println("Using segmented output stream: " + USE_SEGMENTED_BUFFER); + } + + public static AbstractByteListDirectOutputStream create(int estimatedSize) { + if (USE_SEGMENTED_BUFFER) { + return new SegmentedByteListDirectOutputStream(estimatedSize); + } else { + return new ByteListDirectOutputStream(estimatedSize); + } + } + + public abstract ByteList toByteListDirect(Encoding encoding); +} diff --git a/java/src/json/ext/ByteListDirectOutputStream.java b/java/src/json/ext/ByteListDirectOutputStream.java index b22d48128..a92753ca3 100644 --- a/java/src/json/ext/ByteListDirectOutputStream.java +++ b/java/src/json/ext/ByteListDirectOutputStream.java @@ -4,10 +4,9 @@ import org.jruby.util.ByteList; import java.io.IOException; -import java.io.OutputStream; import java.util.Arrays; -public class ByteListDirectOutputStream extends OutputStream { +public class ByteListDirectOutputStream extends AbstractByteListDirectOutputStream { private byte[] buffer; private int length; diff --git a/java/src/json/ext/Generator.java b/java/src/json/ext/Generator.java index 94436cc4a..5f5177bea 100644 --- a/java/src/json/ext/Generator.java +++ b/java/src/json/ext/Generator.java @@ -26,6 +26,8 @@ import org.jruby.util.IOOutputStream; import org.jruby.util.TypeConverter; +import json.ext.ByteListDirectOutputStream; + import java.io.BufferedOutputStream; import java.io.IOException; import java.io.OutputStream; @@ -252,7 +254,7 @@ int guessSize(ThreadContext context, Session session, T object) { } RubyString generateNew(ThreadContext context, Session session, T object) { - ByteListDirectOutputStream buffer = new ByteListDirectOutputStream(guessSize(context, session, object)); + AbstractByteListDirectOutputStream buffer = 
AbstractByteListDirectOutputStream.create(guessSize(context, session, object)); generateToBuffer(context, session, object, buffer); return RubyString.newString(context.runtime, buffer.toByteListDirect(UTF8Encoding.INSTANCE)); } diff --git a/java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java b/java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java new file mode 100644 index 000000000..83a1d89a3 --- /dev/null +++ b/java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java @@ -0,0 +1,106 @@ +package json.ext; + +import org.jcodings.Encoding; +import org.jruby.util.ByteList; + +import java.io.IOException; + +public class LinkedSegmentedByteListDirectOutputStream extends AbstractByteListDirectOutputStream { + private Segment head; + private int length; + private Segment current; + private int numSegments; + + private static class Segment { + static final int DEFAULT_SEGMENT_SIZE = 1024; + byte[] buffer; + int length; + Segment next; + + Segment() { + this(DEFAULT_SEGMENT_SIZE); + } + + Segment(int size) { + if (size <= 0) { + size = DEFAULT_SEGMENT_SIZE; + } + buffer = new byte[Math.max(size, DEFAULT_SEGMENT_SIZE)]; + } + } + + LinkedSegmentedByteListDirectOutputStream() { + this(Segment.DEFAULT_SEGMENT_SIZE); + } + + LinkedSegmentedByteListDirectOutputStream(int size) { + current = head = new Segment(size); + } + + public ByteList toByteListDirect(Encoding encoding) { + byte[] buffer = new byte[length]; + Segment segment = head; + int pos = 0; + while (segment != null) { + System.arraycopy(segment.buffer, 0, buffer, pos, segment.length); + pos += segment.length; + segment = segment.next; + } + return new ByteList(buffer, 0, length, encoding, false); + } + + @Override + public void write(int b) throws IOException { + Segment c = current; + if (c.length == c.buffer.length) { + // This check is deliberately in the case the current segment is full. We want to + // avoid this check in the common case where we have space in the current segment. 
+ if (this.length + 1 < 0) { + throw new IOException("Total length exceeds maximum length of an array."); + } + if (c.next == null) { + numSegments++; + c.next = new Segment(c.buffer.length * 2); + } + c = c.next; + current = c; + } + c.buffer[c.length++] = (byte)b; + length++; + } + + @Override + public void write(byte[] bytes, int start, int length) throws IOException { + Segment c = current; + int remaining = length; + + while (remaining > 0) { + if (c.length == c.buffer.length) { + // This check is deliberately in the case the current segment is full. We want to + // avoid this check in the common case where we have space in the current segment. + if (this.length + remaining < 0) { + throw new IOException("Total length exceeds maximum length of an array."); + } + if (c.next == null) { + numSegments++; + c.next = new Segment(c.buffer.length * 2); + } + c = c.next; + current = c; + } + int currentLength = c.length; + int currentCapacity = c.buffer.length; + int copyLength = Math.min(remaining, currentCapacity - currentLength); + System.arraycopy(bytes, start, c.buffer, currentLength, copyLength); + c.length += copyLength; + this.length += copyLength; + start += copyLength; + remaining -= copyLength; + } + } + + @Override + public void write(byte[] bytes) throws IOException { + write(bytes, 0, bytes.length); + } +} diff --git a/java/src/json/ext/SegmentedByteListDirectOutputStream.java b/java/src/json/ext/SegmentedByteListDirectOutputStream.java new file mode 100644 index 000000000..86d38cd15 --- /dev/null +++ b/java/src/json/ext/SegmentedByteListDirectOutputStream.java @@ -0,0 +1,79 @@ +package json.ext; + +import org.jcodings.Encoding; +import org.jruby.util.ByteList; + +import java.io.IOException; + +public class SegmentedByteListDirectOutputStream extends AbstractByteListDirectOutputStream { + private static final int DEFAULT_CAPACITY = 1024; + + private int totalLength; + private byte[][] segments = new byte[21][]; + private int currentSegmentIndex; + private 
int currentSegmentLength; + private byte[] currentSegment; + + SegmentedByteListDirectOutputStream(int size) { + currentSegment = new byte[Math.max(size, DEFAULT_CAPACITY)]; + segments[0] = currentSegment; + } + + public ByteList toByteListDirect(Encoding encoding) { + byte[] buffer = new byte[totalLength]; + int pos = 0; + // We handle the current segment separately. + for (int i = 0; i < currentSegmentIndex; i++) { + byte[] segment = segments[i]; + System.arraycopy(segment, 0, buffer, pos, segment.length); + pos += segment.length; + } + System.arraycopy(currentSegment, 0, buffer, pos, currentSegmentLength); + return new ByteList(buffer, 0, totalLength, encoding, false); + } + + @Override + public void write(int b) throws IOException { + if (currentSegmentLength == currentSegment.length) { + if (totalLength + 1 < 0) { + throw new IOException("Total length exceeds maximum length of an array."); + } + currentSegmentIndex++; + int capacity = currentSegment.length * 2; + currentSegment = new byte[capacity]; + currentSegmentLength = 0; + segments[currentSegmentIndex] = currentSegment; + } + currentSegment[currentSegmentLength++] = (byte) b; + totalLength++; + } + + @Override + public void write(byte[] bytes, int start, int length) throws IOException { + int remaining = length; + + while (remaining > 0) { + if (currentSegmentLength == currentSegment.length) { + if (totalLength + remaining < 0) { + throw new IOException("Total length exceeds maximum length of an array."); + } + currentSegmentIndex++; + int capacity = currentSegment.length * 2; + currentSegment = new byte[capacity]; + currentSegmentLength = 0; + segments[currentSegmentIndex] = currentSegment; + } + int toWrite = Math.min(remaining, currentSegment.length - currentSegmentLength); + System.arraycopy(bytes, start, currentSegment, currentSegmentLength, toWrite); + currentSegmentLength += toWrite; + start += toWrite; + remaining -= toWrite; + } + totalLength += length; + } + + @Override + public void 
write(byte[] bytes) throws IOException { + write(bytes, 0, bytes.length); + } +} diff --git a/java/src/json/ext/StringEncoder.java b/java/src/json/ext/StringEncoder.java index d178d0bd8..6783be9de 100644 --- a/java/src/json/ext/StringEncoder.java +++ b/java/src/json/ext/StringEncoder.java @@ -5,6 +5,11 @@ */ package json.ext; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; + import org.jcodings.Encoding; import org.jcodings.specific.ASCIIEncoding; import org.jcodings.specific.USASCIIEncoding; @@ -17,10 +22,6 @@ import org.jruby.util.ByteList; import org.jruby.util.StringSupport; -import java.io.IOException; -import java.io.OutputStream; -import java.nio.charset.StandardCharsets; - /** * An encoder that reads from the given source and outputs its representation * to another ByteList. The source string is fully checked for UTF-8 validity, @@ -114,6 +115,17 @@ class StringEncoder extends ByteListTranscoder { protected final byte[] escapeTable; + private static final String USE_SWAR_BASIC_ENCODER_PROP = "json.useSWARBasicEncoder"; + private static final String USE_SWAR_BASIC_ENCODER_DEFAULT = "true"; + private static final boolean USE_BASIC_SWAR_ENCODER; + + static { + USE_BASIC_SWAR_ENCODER = Boolean.parseBoolean( + System.getProperty(USE_SWAR_BASIC_ENCODER_PROP, USE_SWAR_BASIC_ENCODER_DEFAULT)); + // XXX Is there a logger we can use here? 
+ // System.out.println("Using SWAR basic encoder: " + USE_BASIC_SWAR_ENCODER); + } + OutputStream out; // Escaped characters will reuse this array, to avoid new allocations @@ -198,8 +210,109 @@ private static RubyString tryWeirdEncodings(ThreadContext context, RubyString st return str; } + void encodeBasicSWAR(ByteList src) throws IOException { + byte[] hexdig = HEX; + byte[] scratch = aux; + + byte[] ptrBytes = src.unsafeBytes(); + int ptr = src.begin(); + int len = src.realSize(); + + int beg = 0; + int pos = 0; + + ByteBuffer bb = ByteBuffer.wrap(ptrBytes, 0, len); + while (pos + 8 <= len) { + long x = bb.getLong(ptr + pos); + long is_ascii = 0x8080808080808080L & ~x; + long xor2 = x ^ 0x0202020202020202L; + long lt32_or_eq34 = xor2 - 0x2121212121212121L; + long sub92 = x ^ 0x5C5C5C5C5C5C5C5CL; + long eq92 = (sub92 - 0x0101010101010101L); + boolean needs_escape = ((lt32_or_eq34 | eq92) & is_ascii) != 0; + if (needs_escape) { + // Find the exact byte that needs escaping + for (int i = 0; i < 8; i++) { + int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos + i]); + int ch_len = ESCAPE_TABLE[ch]; + if (ch_len > 0) { + beg = pos = flushPos(pos + i, beg, ptrBytes, ptr, 1); + escapeAscii(ch, scratch, hexdig); + break; + } + } + continue; + } + + pos += 8; + } + + // Handle remaining bytes one by one + while (pos < len) { + int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]); + int ch_len = ESCAPE_TABLE[ch]; + if (ch_len > 0) { + beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1); + escapeAscii(ch, scratch, hexdig); + } else { + pos++; + } + } + + while (pos < len) { + int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]); + int ch_len = ESCAPE_TABLE[ch]; + if (ch_len > 0) { + beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1); + escapeAscii(ch, scratch, hexdig); + } else { + pos++; + } + } + + if (beg < len) { + append(ptrBytes, ptr + beg, len - beg); + } + } + + void encodeBasic(ByteList src) throws IOException{ + byte[] hexdig = HEX; + byte[] scratch = aux; + + byte[] ptrBytes = 
src.unsafeBytes(); + int ptr = src.begin(); + int len = src.realSize(); + + int beg = 0; + int pos = 0; + + while (pos < len) { + int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]); + int ch_len = ESCAPE_TABLE[ch]; + if (ch_len > 0) { + beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1); + escapeAscii(ch, scratch, hexdig); + } else { + pos++; + } + } + + if (beg < len) { + append(ptrBytes, ptr + beg, len - beg); + } + } + // C: convert_UTF8_to_JSON void encode(ByteList src) throws IOException { + if (escapeTable == ESCAPE_TABLE) { + if (USE_BASIC_SWAR_ENCODER) { + encodeBasicSWAR(src); + } else { + encodeBasic(src); + } + return; + } + byte[] hexdig = HEX; byte[] scratch = aux; byte[] escapeTable = this.escapeTable; From b2644fff8aff8986ba324317000609412b9578e5 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 13 Aug 2025 08:09:48 -0500 Subject: [PATCH 2/8] Handle the case where the capacity overflows Integer.MAX_VALUE. --- ...inkedSegmentedByteListDirectOutputStream.java | 16 ++++++++++------ .../ext/SegmentedByteListDirectOutputStream.java | 6 ++++++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java b/java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java index 83a1d89a3..125bb11ef 100644 --- a/java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java +++ b/java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java @@ -58,10 +58,12 @@ public void write(int b) throws IOException { if (this.length + 1 < 0) { throw new IOException("Total length exceeds maximum length of an array."); } - if (c.next == null) { - numSegments++; - c.next = new Segment(c.buffer.length * 2); + numSegments++; + int capacity = c.buffer.length * 2; + if (capacity < 0) { + capacity = Integer.MAX_VALUE - length; } + c.next = new Segment(capacity); c = c.next; current = c; } @@ -81,10 +83,12 @@ public void write(byte[] bytes, int start, int length) throws IOException { if (this.length + remaining 
< 0) { throw new IOException("Total length exceeds maximum length of an array."); } - if (c.next == null) { - numSegments++; - c.next = new Segment(c.buffer.length * 2); + numSegments++; + int capacity = c.buffer.length * 2; + if (capacity < 0) { + capacity = Integer.MAX_VALUE - length; } + c.next = new Segment(capacity); c = c.next; current = c; } diff --git a/java/src/json/ext/SegmentedByteListDirectOutputStream.java b/java/src/json/ext/SegmentedByteListDirectOutputStream.java index 86d38cd15..63112fe86 100644 --- a/java/src/json/ext/SegmentedByteListDirectOutputStream.java +++ b/java/src/json/ext/SegmentedByteListDirectOutputStream.java @@ -40,6 +40,9 @@ public void write(int b) throws IOException { } currentSegmentIndex++; int capacity = currentSegment.length * 2; + if (capacity < 0) { + capacity = Integer.MAX_VALUE - totalLength; + } currentSegment = new byte[capacity]; currentSegmentLength = 0; segments[currentSegmentIndex] = currentSegment; @@ -59,6 +62,9 @@ public void write(byte[] bytes, int start, int length) throws IOException { } currentSegmentIndex++; int capacity = currentSegment.length * 2; + if (capacity < 0) { + capacity = Integer.MAX_VALUE - totalLength; + } currentSegment = new byte[capacity]; currentSegmentLength = 0; segments[currentSegmentIndex] = currentSegment; From 32f328728cba8b9b6a969d8c9a6582e5c8ae7d62 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 13 Aug 2025 21:45:42 -0500 Subject: [PATCH 3/8] Remove the LinkedSegmentedByteListDirectOutputStream in favor of the SegmentedByteListDirectOutputStream. 
--- ...edSegmentedByteListDirectOutputStream.java | 110 ------------------ 1 file changed, 110 deletions(-) delete mode 100644 java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java diff --git a/java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java b/java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java deleted file mode 100644 index 125bb11ef..000000000 --- a/java/src/json/ext/LinkedSegmentedByteListDirectOutputStream.java +++ /dev/null @@ -1,110 +0,0 @@ -package json.ext; - -import org.jcodings.Encoding; -import org.jruby.util.ByteList; - -import java.io.IOException; - -public class LinkedSegmentedByteListDirectOutputStream extends AbstractByteListDirectOutputStream { - private Segment head; - private int length; - private Segment current; - private int numSegments; - - private static class Segment { - static final int DEFAULT_SEGMENT_SIZE = 1024; - byte[] buffer; - int length; - Segment next; - - Segment() { - this(DEFAULT_SEGMENT_SIZE); - } - - Segment(int size) { - if (size <= 0) { - size = DEFAULT_SEGMENT_SIZE; - } - buffer = new byte[Math.max(size, DEFAULT_SEGMENT_SIZE)]; - } - } - - LinkedSegmentedByteListDirectOutputStream() { - this(Segment.DEFAULT_SEGMENT_SIZE); - } - - LinkedSegmentedByteListDirectOutputStream(int size) { - current = head = new Segment(size); - } - - public ByteList toByteListDirect(Encoding encoding) { - byte[] buffer = new byte[length]; - Segment segment = head; - int pos = 0; - while (segment != null) { - System.arraycopy(segment.buffer, 0, buffer, pos, segment.length); - pos += segment.length; - segment = segment.next; - } - return new ByteList(buffer, 0, length, encoding, false); - } - - @Override - public void write(int b) throws IOException { - Segment c = current; - if (c.length == c.buffer.length) { - // This check is deliberately in the case the current segment is full. We want to - // avoid this check in the common case where we have space in the current segment. 
- if (this.length + 1 < 0) { - throw new IOException("Total length exceeds maximum length of an array."); - } - numSegments++; - int capacity = c.buffer.length * 2; - if (capacity < 0) { - capacity = Integer.MAX_VALUE - length; - } - c.next = new Segment(capacity); - c = c.next; - current = c; - } - c.buffer[c.length++] = (byte)b; - length++; - } - - @Override - public void write(byte[] bytes, int start, int length) throws IOException { - Segment c = current; - int remaining = length; - - while (remaining > 0) { - if (c.length == c.buffer.length) { - // This check is deliberately in the case the current segment is full. We want to - // avoid this check in the common case where we have space in the current segment. - if (this.length + remaining < 0) { - throw new IOException("Total length exceeds maximum length of an array."); - } - numSegments++; - int capacity = c.buffer.length * 2; - if (capacity < 0) { - capacity = Integer.MAX_VALUE - length; - } - c.next = new Segment(capacity); - c = c.next; - current = c; - } - int currentLength = c.length; - int currentCapacity = c.buffer.length; - int copyLength = Math.min(remaining, currentCapacity - currentLength); - System.arraycopy(bytes, start, c.buffer, currentLength, copyLength); - c.length += copyLength; - this.length += copyLength; - start += copyLength; - remaining -= copyLength; - } - } - - @Override - public void write(byte[] bytes) throws IOException { - write(bytes, 0, bytes.length); - } -} From 44b1d8774ad4e1433a691d0218ec029f7ea8f8bc Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 13 Aug 2025 22:20:12 -0500 Subject: [PATCH 4/8] Use a ternary to determine the capacity of the next segment when growing the output buffer. 
--- .../json/ext/SegmentedByteListDirectOutputStream.java | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/java/src/json/ext/SegmentedByteListDirectOutputStream.java b/java/src/json/ext/SegmentedByteListDirectOutputStream.java index 63112fe86..abad16612 100644 --- a/java/src/json/ext/SegmentedByteListDirectOutputStream.java +++ b/java/src/json/ext/SegmentedByteListDirectOutputStream.java @@ -40,9 +40,7 @@ public void write(int b) throws IOException { } currentSegmentIndex++; int capacity = currentSegment.length * 2; - if (capacity < 0) { - capacity = Integer.MAX_VALUE - totalLength; - } + capacity = (capacity < 0) ? DEFAULT_CAPACITY : capacity; currentSegment = new byte[capacity]; currentSegmentLength = 0; segments[currentSegmentIndex] = currentSegment; @@ -61,10 +59,9 @@ public void write(byte[] bytes, int start, int length) throws IOException { throw new IOException("Total length exceeds maximum length of an array."); } currentSegmentIndex++; - int capacity = currentSegment.length * 2; - if (capacity < 0) { - capacity = Integer.MAX_VALUE - totalLength; - } + int capacity = currentSegment.length << 1; + capacity = (capacity < 0) ? DEFAULT_CAPACITY : capacity; + capacity = (capacity < remaining) ? remaining : capacity; currentSegment = new byte[capacity]; currentSegmentLength = 0; segments[currentSegmentIndex] = currentSegment; From a458201a7e960ada025bab8efaa9e1c96eeb256a Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Thu, 14 Aug 2025 23:07:34 -0500 Subject: [PATCH 5/8] Use SWAR if there are still at least 4 bytes remaining. 
--- java/src/json/ext/StringEncoder.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/java/src/json/ext/StringEncoder.java b/java/src/json/ext/StringEncoder.java index 6783be9de..5c19ea8a5 100644 --- a/java/src/json/ext/StringEncoder.java +++ b/java/src/json/ext/StringEncoder.java @@ -247,15 +247,16 @@ void encodeBasicSWAR(ByteList src) throws IOException { pos += 8; } - // Handle remaining bytes one by one - while (pos < len) { - int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]); - int ch_len = ESCAPE_TABLE[ch]; - if (ch_len > 0) { - beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1); - escapeAscii(ch, scratch, hexdig); - } else { - pos++; + if (pos + 4 <= len) { + int x = bb.getInt(ptr + pos); + int is_ascii = 0x808080 & ~x; + int xor2 = x ^ 0x020202; + int lt32_or_eq34 = xor2 - 0x212121; + int sub92 = x ^ 0x5C5C5C; + int eq92 = (sub92 - 0x010101); + boolean skip_chunk = ((lt32_or_eq34 | eq92) & is_ascii) == 0; + if (skip_chunk) { + pos += 4; } } From 9ebe1051441b937abfe965cd78d09c8f0ed5593f Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Fri, 15 Aug 2025 09:23:21 -0500 Subject: [PATCH 6/8] Ensure the SWAR encoder in the java extension checks every byte. 
--- java/src/json/ext/StringEncoder.java | 10 +++++----- test/json/json_generator_test.rb | 12 ++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/java/src/json/ext/StringEncoder.java b/java/src/json/ext/StringEncoder.java index 5c19ea8a5..cca547b7b 100644 --- a/java/src/json/ext/StringEncoder.java +++ b/java/src/json/ext/StringEncoder.java @@ -249,11 +249,11 @@ void encodeBasicSWAR(ByteList src) throws IOException { if (pos + 4 <= len) { int x = bb.getInt(ptr + pos); - int is_ascii = 0x808080 & ~x; - int xor2 = x ^ 0x020202; - int lt32_or_eq34 = xor2 - 0x212121; - int sub92 = x ^ 0x5C5C5C; - int eq92 = (sub92 - 0x010101); + int is_ascii = 0x80808080 & ~x; + int xor2 = x ^ 0x02020202; + int lt32_or_eq34 = xor2 - 0x21212121; + int sub92 = x ^ 0x5C5C5C5C; + int eq92 = (sub92 - 0x01010101); boolean skip_chunk = ((lt32_or_eq34 | eq92) & is_ascii) == 0; if (skip_chunk) { pos += 4; diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index 4315d109d..6b42de2ad 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -504,6 +504,18 @@ def test_backslash json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]' assert_equal json, generate(data) # + data = '"""""' + json = '"\"\"\"\"\""' + assert_equal json, generate(data) + # + data = "abc\n" + json = '"abc\\n"' + assert_equal json, generate(data) + # + data = "\nabc" + json = '"\\nabc"' + assert_equal json, generate(data) + # data = ["'"] json = '["\\\'"]' assert_equal '["\'"]', generate(data) From 052198a7be9c399d4a416bdf1d2ba78f35b414d6 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 17 Aug 2025 20:31:56 -0500 Subject: [PATCH 7/8] Refactor the SWAR logic into a separate subclass of StringEncoder. 
--- Rakefile | 2 +- java/src/json/ext/Generator.java | 2 +- java/src/json/ext/SWARBasicStringEncoder.java | 84 +++++++++++++++++++ java/src/json/ext/StringEncoder.java | 83 +++--------------- 4 files changed, 96 insertions(+), 75 deletions(-) create mode 100644 java/src/json/ext/SWARBasicStringEncoder.java diff --git a/Rakefile b/Rakefile index 616e7e98e..57b1d0ec5 100644 --- a/Rakefile +++ b/Rakefile @@ -120,7 +120,7 @@ if defined?(RUBY_ENGINE) and RUBY_ENGINE == 'jruby' "json/ext/OptionsReader*.class", "json/ext/Generator*.class", "json/ext/RuntimeInfo*.class", - "json/ext/StringEncoder*.class", + "json/ext/*StringEncoder*.class", "json/ext/Utils*.class" ] sh 'jar', 'cf', File.basename(JRUBY_GENERATOR_JAR), *generator_classes diff --git a/java/src/json/ext/Generator.java b/java/src/json/ext/Generator.java index 5f5177bea..c8452ade6 100644 --- a/java/src/json/ext/Generator.java +++ b/java/src/json/ext/Generator.java @@ -234,7 +234,7 @@ public StringEncoder getStringEncoder(ThreadContext context) { GeneratorState state = getState(context); stringEncoder = state.asciiOnly() ? new StringEncoderAsciiOnly(state.scriptSafe()) : - new StringEncoder(state.scriptSafe()); + (state.scriptSafe()) ? 
new StringEncoder(state.scriptSafe()) : StringEncoder.createBasicEncoder(); } return stringEncoder; } diff --git a/java/src/json/ext/SWARBasicStringEncoder.java b/java/src/json/ext/SWARBasicStringEncoder.java new file mode 100644 index 000000000..9d6d4f2b9 --- /dev/null +++ b/java/src/json/ext/SWARBasicStringEncoder.java @@ -0,0 +1,84 @@ +package json.ext; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.jruby.util.ByteList; + +public class SWARBasicStringEncoder extends StringEncoder { + + public SWARBasicStringEncoder() { + super(ESCAPE_TABLE); + } + + @Override + void encode(ByteList src) throws IOException { + byte[] hexdig = HEX; + byte[] scratch = aux; + + byte[] ptrBytes = src.unsafeBytes(); + int ptr = src.begin(); + int len = src.realSize(); + + int beg = 0; + int pos = 0; + + ByteBuffer bb = ByteBuffer.wrap(ptrBytes, 0, len); + while (pos + 8 <= len) { + long x = bb.getLong(ptr + pos); + if (skipChunk(x)) { + pos += 8; + continue; + } + for (int i = 0; i < 8; i++) { + int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos + i]); + int ch_len = ESCAPE_TABLE[ch]; + if (ch_len > 0) { + beg = pos = flushPos(pos + i, beg, ptrBytes, ptr, 1); + escapeAscii(ch, scratch, hexdig); + break; + } + } + } + + if (pos + 4 <= len) { + int x = bb.getInt(ptr + pos); + if (skipChunk(x)) { + pos += 4; + } + } + + while (pos < len) { + int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]); + int ch_len = ESCAPE_TABLE[ch]; + if (ch_len > 0) { + beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1); + escapeAscii(ch, scratch, hexdig); + } else { + pos++; + } + } + + if (beg < len) { + append(ptrBytes, ptr + beg, len - beg); + } + } + + private boolean skipChunk(long x) { + long is_ascii = 0x8080808080808080L & ~x; + long xor2 = x ^ 0x0202020202020202L; + long lt32_or_eq34 = xor2 - 0x2121212121212121L; + long sub92 = x ^ 0x5C5C5C5C5C5C5C5CL; + long eq92 = (sub92 - 0x0101010101010101L); + return ((lt32_or_eq34 | eq92) & is_ascii) == 0; + } + + private boolean skipChunk(int 
x) { + int is_ascii = 0x80808080 & ~x; + int xor2 = x ^ 0x02020202; + int lt32_or_eq34 = xor2 - 0x21212121; + int sub92 = x ^ 0x5C5C5C5C; + int eq92 = (sub92 - 0x01010101); + return ((lt32_or_eq34 | eq92) & is_ascii) == 0; + } +} diff --git a/java/src/json/ext/StringEncoder.java b/java/src/json/ext/StringEncoder.java index cca547b7b..047b829ff 100644 --- a/java/src/json/ext/StringEncoder.java +++ b/java/src/json/ext/StringEncoder.java @@ -7,7 +7,6 @@ import java.io.IOException; import java.io.OutputStream; -import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import org.jcodings.Encoding; @@ -150,6 +149,14 @@ class StringEncoder extends ByteListTranscoder { this.escapeTable = escapeTable; } + static StringEncoder createBasicEncoder() { + if (USE_BASIC_SWAR_ENCODER) { + return new SWARBasicStringEncoder(); + } else { + return new StringEncoder(false); + } + } + // C: generate_json_string void generate(ThreadContext context, RubyString object, OutputStream buffer) throws IOException { object = ensureValidEncoding(context, object); @@ -210,73 +217,7 @@ private static RubyString tryWeirdEncodings(ThreadContext context, RubyString st return str; } - void encodeBasicSWAR(ByteList src) throws IOException { - byte[] hexdig = HEX; - byte[] scratch = aux; - - byte[] ptrBytes = src.unsafeBytes(); - int ptr = src.begin(); - int len = src.realSize(); - - int beg = 0; - int pos = 0; - - ByteBuffer bb = ByteBuffer.wrap(ptrBytes, 0, len); - while (pos + 8 <= len) { - long x = bb.getLong(ptr + pos); - long is_ascii = 0x8080808080808080L & ~x; - long xor2 = x ^ 0x0202020202020202L; - long lt32_or_eq34 = xor2 - 0x2121212121212121L; - long sub92 = x ^ 0x5C5C5C5C5C5C5C5CL; - long eq92 = (sub92 - 0x0101010101010101L); - boolean needs_escape = ((lt32_or_eq34 | eq92) & is_ascii) != 0; - if (needs_escape) { - // Find the exact byte that needs escaping - for (int i = 0; i < 8; i++) { - int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos + i]); - int ch_len = ESCAPE_TABLE[ch]; - 
if (ch_len > 0) { - beg = pos = flushPos(pos + i, beg, ptrBytes, ptr, 1); - escapeAscii(ch, scratch, hexdig); - break; - } - } - continue; - } - - pos += 8; - } - - if (pos + 4 <= len) { - int x = bb.getInt(ptr + pos); - int is_ascii = 0x80808080 & ~x; - int xor2 = x ^ 0x02020202; - int lt32_or_eq34 = xor2 - 0x21212121; - int sub92 = x ^ 0x5C5C5C5C; - int eq92 = (sub92 - 0x01010101); - boolean skip_chunk = ((lt32_or_eq34 | eq92) & is_ascii) == 0; - if (skip_chunk) { - pos += 4; - } - } - - while (pos < len) { - int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]); - int ch_len = ESCAPE_TABLE[ch]; - if (ch_len > 0) { - beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1); - escapeAscii(ch, scratch, hexdig); - } else { - pos++; - } - } - - if (beg < len) { - append(ptrBytes, ptr + beg, len - beg); - } - } - - void encodeBasic(ByteList src) throws IOException{ + void encodeBasic(ByteList src) throws IOException { byte[] hexdig = HEX; byte[] scratch = aux; @@ -306,11 +247,7 @@ void encodeBasic(ByteList src) throws IOException{ // C: convert_UTF8_to_JSON void encode(ByteList src) throws IOException { if (escapeTable == ESCAPE_TABLE) { - if (USE_BASIC_SWAR_ENCODER) { - encodeBasicSWAR(src); - } else { - encodeBasic(src); - } + encodeBasic(src); return; } From 43a8a8345681586fdc016572cf3f6f38a12977ec Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 17 Aug 2025 20:37:22 -0500 Subject: [PATCH 8/8] Refactor the logic to evaluate every byte in the chunk if there is a byte in that chunk that needs escaping. 
--- .../json/ext/AbstractByteListDirectOutputStream.java | 2 +- java/src/json/ext/SWARBasicStringEncoder.java | 10 ++++++---- .../json/ext/SegmentedByteListDirectOutputStream.java | 2 ++ java/src/json/ext/StringEncoder.java | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/java/src/json/ext/AbstractByteListDirectOutputStream.java b/java/src/json/ext/AbstractByteListDirectOutputStream.java index 64ca29d35..c3175c849 100644 --- a/java/src/json/ext/AbstractByteListDirectOutputStream.java +++ b/java/src/json/ext/AbstractByteListDirectOutputStream.java @@ -7,7 +7,7 @@ abstract class AbstractByteListDirectOutputStream extends OutputStream { - private static final String PROP_SEGMENTED_BUFFER = "json.useSegmentedOutputStream"; + private static final String PROP_SEGMENTED_BUFFER = "jruby.json.useSegmentedOutputStream"; private static final String PROP_SEGMENTED_BUFFER_DEFAULT = "true"; private static final boolean USE_SEGMENTED_BUFFER; diff --git a/java/src/json/ext/SWARBasicStringEncoder.java b/java/src/json/ext/SWARBasicStringEncoder.java index 9d6d4f2b9..8021d73b8 100644 --- a/java/src/json/ext/SWARBasicStringEncoder.java +++ b/java/src/json/ext/SWARBasicStringEncoder.java @@ -30,13 +30,15 @@ void encode(ByteList src) throws IOException { pos += 8; continue; } - for (int i = 0; i < 8; i++) { - int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos + i]); + int chunkEnd = pos + 8; + while (pos < chunkEnd) { + int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]); int ch_len = ESCAPE_TABLE[ch]; if (ch_len > 0) { - beg = pos = flushPos(pos + i, beg, ptrBytes, ptr, 1); + beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1); escapeAscii(ch, scratch, hexdig); - break; + } else { + pos++; } } } diff --git a/java/src/json/ext/SegmentedByteListDirectOutputStream.java b/java/src/json/ext/SegmentedByteListDirectOutputStream.java index abad16612..0e89e50df 100644 --- a/java/src/json/ext/SegmentedByteListDirectOutputStream.java +++ 
b/java/src/json/ext/SegmentedByteListDirectOutputStream.java @@ -9,6 +9,8 @@ public class SegmentedByteListDirectOutputStream extends AbstractByteListDirectO private static final int DEFAULT_CAPACITY = 1024; private int totalLength; + // Why 21? The minimum segment size is 1024 bytes. If we double the segment size each time + // we need a new segment, we only need 21 segments to reach the maximum array size in Java. private byte[][] segments = new byte[21][]; private int currentSegmentIndex; private int currentSegmentLength; diff --git a/java/src/json/ext/StringEncoder.java b/java/src/json/ext/StringEncoder.java index 047b829ff..7f75476d9 100644 --- a/java/src/json/ext/StringEncoder.java +++ b/java/src/json/ext/StringEncoder.java @@ -114,7 +114,7 @@ class StringEncoder extends ByteListTranscoder { protected final byte[] escapeTable; - private static final String USE_SWAR_BASIC_ENCODER_PROP = "json.useSWARBasicEncoder"; + private static final String USE_SWAR_BASIC_ENCODER_PROP = "jruby.json.useSWARBasicEncoder"; private static final String USE_SWAR_BASIC_ENCODER_DEFAULT = "true"; private static final boolean USE_BASIC_SWAR_ENCODER;