diff --git a/Rakefile b/Rakefile
index 52e178ed..57b1d0ec 100644
--- a/Rakefile
+++ b/Rakefile
@@ -116,11 +116,11 @@ if defined?(RUBY_ENGINE) and RUBY_ENGINE == 'jruby'
   file JRUBY_GENERATOR_JAR => :compile do
     cd 'java/src' do
       generator_classes = FileList[
-        "json/ext/ByteList*.class",
+        "json/ext/*ByteList*.class",
         "json/ext/OptionsReader*.class",
         "json/ext/Generator*.class",
         "json/ext/RuntimeInfo*.class",
-        "json/ext/StringEncoder*.class",
+        "json/ext/*StringEncoder*.class",
         "json/ext/Utils*.class"
       ]
       sh 'jar', 'cf', File.basename(JRUBY_GENERATOR_JAR), *generator_classes
diff --git a/java/src/json/ext/AbstractByteListDirectOutputStream.java b/java/src/json/ext/AbstractByteListDirectOutputStream.java
new file mode 100644
index 00000000..c3175c84
--- /dev/null
+++ b/java/src/json/ext/AbstractByteListDirectOutputStream.java
@@ -0,0 +1,49 @@
+package json.ext;
+
+import java.io.OutputStream;
+
+import org.jcodings.Encoding;
+import org.jruby.util.ByteList;
+
+/**
+ * Common base for the in-memory output streams the generator writes into.
+ * Which implementation {@link #create(int)} returns is decided once, at class
+ * initialization, from the {@code jruby.json.useSegmentedOutputStream} system
+ * property (defaults to {@code true}).
+ */
+abstract class AbstractByteListDirectOutputStream extends OutputStream {
+
+    private static final String PROP_SEGMENTED_BUFFER = "jruby.json.useSegmentedOutputStream";
+    private static final String PROP_SEGMENTED_BUFFER_DEFAULT = "true";
+
+    private static final boolean USE_SEGMENTED_BUFFER;
+
+    static {
+        String useSegmentedOutputStream = System.getProperty(PROP_SEGMENTED_BUFFER, PROP_SEGMENTED_BUFFER_DEFAULT);
+        USE_SEGMENTED_BUFFER = Boolean.parseBoolean(useSegmentedOutputStream);
+        // TODO: report the selected implementation once a logger is available here.
+    }
+
+    /**
+     * Creates the configured stream implementation.
+     *
+     * @param estimatedSize size hint forwarded to the implementation's constructor
+     * @return a {@link SegmentedByteListDirectOutputStream} when the property is enabled,
+     *         otherwise a {@link ByteListDirectOutputStream}
+     */
+    public static AbstractByteListDirectOutputStream create(int estimatedSize) {
+        if (USE_SEGMENTED_BUFFER) {
+            return new SegmentedByteListDirectOutputStream(estimatedSize);
+        } else {
+            return new ByteListDirectOutputStream(estimatedSize);
+        }
+    }
+
+    /**
+     * Returns the bytes written so far as a {@link ByteList}.
+     *
+     * @param encoding the encoding to attach to the returned byte list
+     * @return the accumulated output
+     */
+    public abstract ByteList toByteListDirect(Encoding encoding);
+}
diff --git a/java/src/json/ext/ByteListDirectOutputStream.java b/java/src/json/ext/ByteListDirectOutputStream.java
index b22d4812..a92753ca 100644
--- a/java/src/json/ext/ByteListDirectOutputStream.java
+++ b/java/src/json/ext/ByteListDirectOutputStream.java
@@ -4,10 +4,9 @@
 import org.jruby.util.ByteList;
 
 import java.io.IOException;
-import java.io.OutputStream;
 import java.util.Arrays;
 
-public class ByteListDirectOutputStream extends OutputStream {
+public class ByteListDirectOutputStream extends AbstractByteListDirectOutputStream {
     private byte[] buffer;
     private int length;
 
diff --git a/java/src/json/ext/Generator.java b/java/src/json/ext/Generator.java
index 94436cc4..c8452ade 100644
--- a/java/src/json/ext/Generator.java
+++ b/java/src/json/ext/Generator.java
@@ -26,6 +26,8 @@
 import org.jruby.util.IOOutputStream;
 import org.jruby.util.TypeConverter;
 
+import json.ext.ByteListDirectOutputStream;
+
 import java.io.BufferedOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
@@ -232,7 +234,7 @@ public StringEncoder getStringEncoder(ThreadContext context) {
                 GeneratorState state = getState(context);
                 stringEncoder = state.asciiOnly() ?
                         new StringEncoderAsciiOnly(state.scriptSafe()) :
-                        new StringEncoder(state.scriptSafe());
+                        (state.scriptSafe()) ? new StringEncoder(state.scriptSafe()) : StringEncoder.createBasicEncoder();
             }
             return stringEncoder;
         }
@@ -252,7 +254,7 @@ int guessSize(ThreadContext context, Session session, T object) {
         }
 
         RubyString generateNew(ThreadContext context, Session session, T object) {
-            ByteListDirectOutputStream buffer = new ByteListDirectOutputStream(guessSize(context, session, object));
+            AbstractByteListDirectOutputStream buffer = AbstractByteListDirectOutputStream.create(guessSize(context, session, object));
             generateToBuffer(context, session, object, buffer);
             return RubyString.newString(context.runtime, buffer.toByteListDirect(UTF8Encoding.INSTANCE));
         }
diff --git a/java/src/json/ext/SWARBasicStringEncoder.java b/java/src/json/ext/SWARBasicStringEncoder.java
new file mode 100644
index 00000000..8021d73b
--- /dev/null
+++ b/java/src/json/ext/SWARBasicStringEncoder.java
@@ -0,0 +1,110 @@
+package json.ext;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.jruby.util.ByteList;
+
+/**
+ * A {@link StringEncoder} for the basic escape table that inspects the input
+ * eight (then four) bytes at a time with SWAR bit tricks and only drops to the
+ * byte-by-byte loop for chunks that may contain a byte needing an escape.
+ */
+public class SWARBasicStringEncoder extends StringEncoder {
+
+    public SWARBasicStringEncoder() {
+        super(ESCAPE_TABLE);
+    }
+
+    /**
+     * Encodes {@code src}, escaping every byte that {@code ESCAPE_TABLE} marks.
+     *
+     * @param src the bytes to encode, read from {@code begin()} for {@code realSize()} bytes
+     * @throws IOException propagated from the helpers that write the output
+     */
+    @Override
+    void encode(ByteList src) throws IOException {
+        byte[] hexdig = HEX;
+        byte[] scratch = aux;
+
+        byte[] ptrBytes = src.unsafeBytes();
+        int ptr = src.begin();
+        int len = src.realSize();
+
+        int beg = 0;
+        int pos = 0;
+
+        // getLong/getInt below take absolute indexes into the backing array, so the
+        // buffer's limit must be ptr + len. Wrapping with offset 0 put the limit at len
+        // and made the reads throw for any ByteList whose begin() is not 0.
+        ByteBuffer bb = ByteBuffer.wrap(ptrBytes, ptr, len);
+        while (pos + 8 <= len) {
+            long x = bb.getLong(ptr + pos);
+            if (skipChunk(x)) {
+                pos += 8;
+                continue;
+            }
+            // Something in this chunk may need escaping: walk it byte by byte.
+            int chunkEnd = pos + 8;
+            while (pos < chunkEnd) {
+                int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]);
+                int ch_len = ESCAPE_TABLE[ch];
+                if (ch_len > 0) {
+                    beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1);
+                    escapeAscii(ch, scratch, hexdig);
+                } else {
+                    pos++;
+                }
+            }
+        }
+
+        if (pos + 4 <= len) {
+            int x = bb.getInt(ptr + pos);
+            if (skipChunk(x)) {
+                pos += 4;
+            }
+        }
+
+        while (pos < len) {
+            int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]);
+            int ch_len = ESCAPE_TABLE[ch];
+            if (ch_len > 0) {
+                beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1);
+                escapeAscii(ch, scratch, hexdig);
+            } else {
+                pos++;
+            }
+        }
+
+        if (beg < len) {
+            append(ptrBytes, ptr + beg, len - beg);
+        }
+    }
+
+    /**
+     * Tests eight bytes at once. A byte is flagged when its high bit is clear and it
+     * is below 0x20, equal to 0x22 (double quote) or equal to 0x5C (backslash).
+     * Borrows between byte lanes can flag a harmless byte, which only costs a trip
+     * through the byte-by-byte loop.
+     *
+     * @return {@code true} when no byte of {@code x} is flagged
+     */
+    private boolean skipChunk(long x) {
+        long is_ascii = 0x8080808080808080L & ~x;
+        long xor2 = x ^ 0x0202020202020202L;
+        long lt32_or_eq34 = xor2 - 0x2121212121212121L;
+        long sub92 = x ^ 0x5C5C5C5C5C5C5C5CL;
+        long eq92 = (sub92 - 0x0101010101010101L);
+        return ((lt32_or_eq34 | eq92) & is_ascii) == 0;
+    }
+
+    /** Four-byte variant of {@link #skipChunk(long)}. */
+    private boolean skipChunk(int x) {
+        int is_ascii = 0x80808080 & ~x;
+        int xor2 = x ^ 0x02020202;
+        int lt32_or_eq34 = xor2 - 0x21212121;
+        int sub92 = x ^ 0x5C5C5C5C;
+        int eq92 = (sub92 - 0x01010101);
+        return ((lt32_or_eq34 | eq92) & is_ascii) == 0;
+    }
+}
diff --git a/java/src/json/ext/SegmentedByteListDirectOutputStream.java b/java/src/json/ext/SegmentedByteListDirectOutputStream.java
new file mode 100644
index 00000000..0e89e50d
--- /dev/null
+++ b/java/src/json/ext/SegmentedByteListDirectOutputStream.java
@@ -0,0 +1,109 @@
+package json.ext;
+
+import org.jcodings.Encoding;
+import org.jruby.util.ByteList;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * An output stream that stores what is written in a series of segments rather
+ * than one contiguous array, so making room for more data never copies bytes
+ * that were already written. The segments are joined once, by
+ * {@link #toByteListDirect(Encoding)}.
+ */
+public class SegmentedByteListDirectOutputStream extends AbstractByteListDirectOutputStream {
+    private static final int DEFAULT_CAPACITY = 1024;
+
+    private int totalLength;
+    // Why 21? The minimum segment size is 1024 bytes. If we double the segment size each time
+    // we need a new segment, 21 segments reach the maximum array size in Java. Segment sizes
+    // are not always exact doublings, so the table is still grown if more are ever needed.
+    private byte[][] segments = new byte[21][];
+    private int currentSegmentIndex;
+    private int currentSegmentLength;
+    private byte[] currentSegment;
+
+    SegmentedByteListDirectOutputStream(int size) {
+        currentSegment = new byte[Math.max(size, DEFAULT_CAPACITY)];
+        segments[0] = currentSegment;
+    }
+
+    /**
+     * Copies all segments into one newly allocated array.
+     *
+     * @param encoding the encoding to attach to the returned byte list
+     * @return a {@link ByteList} containing exactly the bytes written so far
+     */
+    @Override
+    public ByteList toByteListDirect(Encoding encoding) {
+        byte[] buffer = new byte[totalLength];
+        int pos = 0;
+        // We handle the current segment separately.
+        for (int i = 0; i < currentSegmentIndex; i++) {
+            byte[] segment = segments[i];
+            System.arraycopy(segment, 0, buffer, pos, segment.length);
+            pos += segment.length;
+        }
+        System.arraycopy(currentSegment, 0, buffer, pos, currentSegmentLength);
+        return new ByteList(buffer, 0, totalLength, encoding, false);
+    }
+
+    @Override
+    public void write(int b) throws IOException {
+        ensureRoomFor(1);
+        if (currentSegmentLength == currentSegment.length) {
+            startNewSegment(1);
+        }
+        currentSegment[currentSegmentLength++] = (byte) b;
+        totalLength++;
+    }
+
+    @Override
+    public void write(byte[] bytes, int start, int length) throws IOException {
+        ensureRoomFor(length);
+        int remaining = length;
+
+        while (remaining > 0) {
+            if (currentSegmentLength == currentSegment.length) {
+                startNewSegment(remaining);
+            }
+            int toWrite = Math.min(remaining, currentSegment.length - currentSegmentLength);
+            System.arraycopy(bytes, start, currentSegment, currentSegmentLength, toWrite);
+            currentSegmentLength += toWrite;
+            start += toWrite;
+            remaining -= toWrite;
+        }
+        totalLength += length;
+    }
+
+    @Override
+    public void write(byte[] bytes) throws IOException {
+        write(bytes, 0, bytes.length);
+    }
+
+    /**
+     * Rejects a write that would push the total past what one array can hold. This is
+     * checked on every write, not only when a segment fills up, because the segments'
+     * combined capacity can be larger than {@code Integer.MAX_VALUE}.
+     */
+    private void ensureRoomFor(int additional) throws IOException {
+        if (additional > Integer.MAX_VALUE - totalLength) {
+            throw new IOException("Total length exceeds maximum length of an array.");
+        }
+    }
+
+    /** Makes a fresh segment of at least {@code minCapacity} bytes the current one. */
+    private void startNewSegment(int minCapacity) {
+        int capacity = currentSegment.length << 1;
+        // Doubling overflowed: start over from the default size.
+        capacity = (capacity < 0) ? DEFAULT_CAPACITY : capacity;
+        capacity = Math.max(capacity, minCapacity);
+        if (++currentSegmentIndex == segments.length) {
+            segments = Arrays.copyOf(segments, segments.length * 2);
+        }
+        currentSegment = new byte[capacity];
+        currentSegmentLength = 0;
+        segments[currentSegmentIndex] = currentSegment;
+    }
+}
diff --git a/java/src/json/ext/StringEncoder.java b/java/src/json/ext/StringEncoder.java
index d178d0bd..7f75476d 100644
--- a/java/src/json/ext/StringEncoder.java
+++ b/java/src/json/ext/StringEncoder.java
@@ -5,6 +5,10 @@
  */
 package json.ext;
 
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+
 import org.jcodings.Encoding;
 import org.jcodings.specific.ASCIIEncoding;
 import org.jcodings.specific.USASCIIEncoding;
@@ -17,10 +21,6 @@
 import org.jruby.util.ByteList;
 import org.jruby.util.StringSupport;
 
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.charset.StandardCharsets;
-
 /**
  * An encoder that reads from the given source and outputs its representation
  * to another ByteList. The source string is fully checked for UTF-8 validity,
@@ -114,6 +114,17 @@ class StringEncoder extends ByteListTranscoder {
 
     protected final byte[] escapeTable;
 
+    private static final String USE_SWAR_BASIC_ENCODER_PROP = "jruby.json.useSWARBasicEncoder";
+    private static final String USE_SWAR_BASIC_ENCODER_DEFAULT = "true";
+    private static final boolean USE_BASIC_SWAR_ENCODER;
+
+    static {
+        USE_BASIC_SWAR_ENCODER = Boolean.parseBoolean(
+            System.getProperty(USE_SWAR_BASIC_ENCODER_PROP, USE_SWAR_BASIC_ENCODER_DEFAULT));
+        // XXX Is there a logger we can use here?
+        // System.out.println("Using SWAR basic encoder: " + USE_BASIC_SWAR_ENCODER);
+    }
+
     OutputStream out;
 
     // Escaped characters will reuse this array, to avoid new allocations
@@ -138,6 +149,14 @@ class StringEncoder extends ByteListTranscoder {
         this.escapeTable = escapeTable;
     }
 
+    static StringEncoder createBasicEncoder() {
+        if (USE_BASIC_SWAR_ENCODER) {
+            return new SWARBasicStringEncoder();
+        } else {
+            return new StringEncoder(false);
+        }
+    }
+
     // C: generate_json_string
     void generate(ThreadContext context, RubyString object, OutputStream buffer) throws IOException {
         object = ensureValidEncoding(context, object);
@@ -198,8 +217,40 @@ private static RubyString tryWeirdEncodings(ThreadContext context, RubyString st
         return str;
     }
 
+    void encodeBasic(ByteList src) throws IOException {
+        byte[] hexdig = HEX;
+        byte[] scratch = aux;
+
+        byte[] ptrBytes = src.unsafeBytes();
+        int ptr = src.begin();
+        int len = src.realSize();
+
+        int beg = 0;
+        int pos = 0;
+
+        while (pos < len) {
+            int ch = Byte.toUnsignedInt(ptrBytes[ptr + pos]);
+            int ch_len = ESCAPE_TABLE[ch];
+            if (ch_len > 0) {
+                beg = pos = flushPos(pos, beg, ptrBytes, ptr, 1);
+                escapeAscii(ch, scratch, hexdig);
+            } else {
+                pos++;
+            }
+        }
+
+        if (beg < len) {
+            append(ptrBytes, ptr + beg, len - beg);
+        }
+    }
+
     // C: convert_UTF8_to_JSON
     void encode(ByteList src) throws IOException {
+        if (escapeTable == ESCAPE_TABLE) {
+            encodeBasic(src);
+            return;
+        }
+
         byte[] hexdig = HEX;
         byte[] scratch = aux;
         byte[] escapeTable = this.escapeTable;
diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb
index 4315d109..6b42de2a 100755
--- a/test/json/json_generator_test.rb
+++ b/test/json/json_generator_test.rb
@@ -504,6 +504,23 @@ def test_backslash
     json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]'
     assert_equal json, generate(data)
     #
+    data = '"""""'
+    json = '"\"\"\"\"\""'
+    assert_equal json, generate(data)
+    #
+    data = "abc\n"
+    json = '"abc\\n"'
+    assert_equal json, generate(data)
+    #
+    data = "\nabc"
+    json = '"\\nabc"'
+    assert_equal json, generate(data)
+    #
+    # a slice of a longer string, whose bytes may not start at offset 0 of the backing buffer
+    data = ("xxxxxxxx" + "abcdefgh\n")[8..-1]
+    json = '"abcdefgh\\n"'
+    assert_equal json, generate(data)
+    #
     data = ["'"]
     json = '["\\\'"]'
     assert_equal '["\'"]', generate(data)