From bd64ed6d2a87b136f68f322e4db413aa37a5eabc Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Thu, 16 May 2019 11:10:27 +1000 Subject: [PATCH] SOLR-13437: fork noggit code into Solr (#666) * SOLR-13437: fork noggit code into Solr --- lucene/ivy-versions.properties | 1 - solr/CHANGES.txt | 2 + solr/NOTICE.txt | 15 +- .../schema/TestSchemalessBufferedUpdates.java | 2 +- solr/solrj/ivy.xml | 1 - .../org/apache/solr/common/cloud/Replica.java | 4 +- .../apache/solr/common/cloud/RoutingRule.java | 4 +- .../apache/solr/common/util/ByteUtils.java | 1 + solr/solrj/src/java/org/noggit/CharArr.java | 394 +++++ .../solrj/src/java/org/noggit/JSONParser.java | 1297 +++++++++++++++++ solr/solrj/src/java/org/noggit/JSONUtil.java | 203 +++ .../solrj/src/java/org/noggit/JSONWriter.java | 358 +++++ .../src/java/org/noggit/ObjectBuilder.java | 168 +++ .../src/test/org/noggit/TestJSONParser.java | 690 +++++++++ .../src/test/org/noggit/TestJSONWriter.java | 94 ++ .../test/org/noggit/TestObjectBuilder.java | 99 ++ 16 files changed, 3325 insertions(+), 8 deletions(-) create mode 100644 solr/solrj/src/java/org/noggit/CharArr.java create mode 100644 solr/solrj/src/java/org/noggit/JSONParser.java create mode 100644 solr/solrj/src/java/org/noggit/JSONUtil.java create mode 100644 solr/solrj/src/java/org/noggit/JSONWriter.java create mode 100644 solr/solrj/src/java/org/noggit/ObjectBuilder.java create mode 100644 solr/solrj/src/test/org/noggit/TestJSONParser.java create mode 100644 solr/solrj/src/test/org/noggit/TestJSONWriter.java create mode 100644 solr/solrj/src/test/org/noggit/TestObjectBuilder.java diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index 98c94d729cb..7767b7c86af 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -274,7 +274,6 @@ org.gagravarr.vorbis.java.version = 0.8 /org.mockito/mockito-core = 2.23.4 -/org.noggit/noggit = 0.8 /org.objenesis/objenesis = 2.6 org.ow2.asm.version = 5.1 diff --git a/solr/CHANGES.txt 
b/solr/CHANGES.txt index cb33ebaadf8..161e7efcc3a 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -94,6 +94,8 @@ Other Changes * SOLR-13453: Adjust auth metrics asserts in tests caused by SOLR-13449 (janhoy) +* SOLR-13437: noggit json parser is forked into solrj (noble) + ================== 8.1.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. diff --git a/solr/NOTICE.txt b/solr/NOTICE.txt index daeddbbbbd5..760d650b9de 100644 --- a/solr/NOTICE.txt +++ b/solr/NOTICE.txt @@ -590,4 +590,17 @@ The Base64 implementation in this software was derived from the Apache Commons Codec project. http://commons.apache.org/proper/commons-codec/ JSON processing in this software was derived from the JSON.simple toolkit. -https://code.google.com/p/json-simple/ \ No newline at end of file +https://code.google.com/p/json-simple/ + +========================================================================= +== noggit notice == +========================================================================= + +noggit + +Copyright 2006- Yonik Seeley + +Noggit is a fast streaming JSON parser for java. The code is included +into Solr codebase. + +https://github.com/yonik/noggit diff --git a/solr/core/src/test/org/apache/solr/schema/TestSchemalessBufferedUpdates.java b/solr/core/src/test/org/apache/solr/schema/TestSchemalessBufferedUpdates.java index eb1031caaa3..96ee4c6188c 100644 --- a/solr/core/src/test/org/apache/solr/schema/TestSchemalessBufferedUpdates.java +++ b/solr/core/src/test/org/apache/solr/schema/TestSchemalessBufferedUpdates.java @@ -145,7 +145,7 @@ public class TestSchemalessBufferedUpdates extends SolrTestCaseJ4 { UpdateRequestProcessor processor = chainUpToDUP.createProcessor(req, rsp); processor.processAdd(cmd); if (cmd.solrDoc.get("f_dt").getValue() instanceof Date) { - // Non-JSON types (Date in this case) aren't handled properly in noggit-0.6. 
Although this is fixed in + // Non-JSON types (Date in this case) aren't handled properly in noggit-0.6. Although this is fixed in // https://github.com/yonik/noggit/commit/ec3e732af7c9425e8f40297463cbe294154682b1 to call obj.toString(), // Date::toString produces a Date representation that Solr doesn't like, so we convert using Instant::toString cmd.solrDoc.get("f_dt").setValue(((Date) cmd.solrDoc.get("f_dt").getValue()).toInstant().toString()); diff --git a/solr/solrj/ivy.xml b/solr/solrj/ivy.xml index 85db8ce5dcb..c158510123f 100644 --- a/solr/solrj/ivy.xml +++ b/solr/solrj/ivy.xml @@ -36,7 +36,6 @@ - diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java b/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java index d73282bb414..e3b6d5dd184 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java @@ -20,7 +20,7 @@ import java.util.Locale; import java.util.Map; import java.util.Set; -import org.noggit.JSONUtil; +import org.apache.solr.common.util.Utils; public class Replica extends ZkNodeProps { @@ -183,6 +183,6 @@ public class Replica extends ZkNodeProps { @Override public String toString() { - return name + ':' + JSONUtil.toJSON(propMap, -1); // small enough, keep it on one line (i.e. no indent) + return name + ':' + Utils.toJSONString(propMap); // small enough, keep it on one line (i.e. 
no indent) } } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/RoutingRule.java b/solr/solrj/src/java/org/apache/solr/common/cloud/RoutingRule.java index 503c39eb105..5e8e050a24b 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/RoutingRule.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/RoutingRule.java @@ -21,7 +21,7 @@ import java.util.List; import java.util.Map; import org.apache.solr.common.util.SuppressForbidden; -import org.noggit.JSONUtil; +import org.apache.solr.common.util.Utils; /** * Used for routing docs with particular keys into another collection @@ -72,6 +72,6 @@ public class RoutingRule extends ZkNodeProps { @Override public String toString() { - return JSONUtil.toJSON(propMap, -1); + return Utils.toJSONString(propMap); } } diff --git a/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java b/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java index ec4acdb6871..be64e6cac8e 100644 --- a/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java +++ b/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java @@ -21,6 +21,7 @@ import java.io.OutputStream; import org.noggit.CharArr; + public class ByteUtils { /** Maximum number of UTF8 bytes per UTF16 character. */ diff --git a/solr/solrj/src/java/org/noggit/CharArr.java b/solr/solrj/src/java/org/noggit/CharArr.java new file mode 100644 index 00000000000..9ecc8e62609 --- /dev/null +++ b/solr/solrj/src/java/org/noggit/CharArr.java @@ -0,0 +1,394 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.noggit; + + +import java.io.IOException; +import java.io.Reader; +import java.io.Writer; +import java.nio.CharBuffer; + +public class CharArr implements CharSequence, Appendable { + protected char[] buf; + protected int start; + protected int end; + + public CharArr() { + this(32); + } + + public CharArr(int size) { + buf = new char[size]; + } + + public CharArr(char[] arr, int start, int end) { + set(arr, start, end); + } + + public void setStart(int start) { + this.start = start; + } + + public void setEnd(int end) { + this.end = end; + } + + public void set(char[] arr, int start, int end) { + this.buf = arr; + this.start = start; + this.end = end; + } + + public char[] getArray() { + return buf; + } + + public int getStart() { + return start; + } + + public int getEnd() { + return end; + } + + public int size() { + return end - start; + } + + @Override + public int length() { + return size(); + } + + /** + * The total capacity of the backing array, i.e. getArray().length + */ + public int capacity() { + return buf.length; + } + + + @Override + public char charAt(int index) { + return buf[start + index]; + } + + @Override + public CharArr subSequence(int start, int end) { + return new CharArr(buf, this.start + start, this.start + end); + } + + public int read() throws IOException { + if (start >= end) return -1; + return buf[start++]; + } + + public int read(char cbuf[], int off, int len) { + // TODO: not implemented yet; copies nothing and always returns 0 + return 0; + } + + public void unsafeWrite(char b) { + buf[end++] = b; + } + + public void unsafeWrite(int b) { + unsafeWrite((char) b); +
} + + public void unsafeWrite(char b[], int off, int len) { + System.arraycopy(b, off, buf, end, len); + end += len; + } + + protected void resize(int len) { + char newbuf[] = new char[Math.max(buf.length << 1, len)]; + System.arraycopy(buf, start, newbuf, start, size()); // keep the data at its original offset so start/end remain valid + buf = newbuf; + } + + public void reserve(int num) { + if (end + num > buf.length) resize(end + num); + } + + public void write(char b) { + if (end >= buf.length) { + resize(end + 1); + } + unsafeWrite(b); + } + + public final void write(int b) { + write((char) b); + } + + public final void write(char[] b) { + write(b, 0, b.length); + } + + public void write(char b[], int off, int len) { + reserve(len); + unsafeWrite(b, off, len); + } + + public final void write(CharArr arr) { + write(arr.buf, arr.start, arr.end - arr.start); + } + + public final void write(String s) { + write(s, 0, s.length()); + } + + public void write(String s, int stringOffset, int len) { + reserve(len); + s.getChars(stringOffset, stringOffset + len, buf, end); // String.getChars takes an end index, not a length + end += len; + } + + public void flush() { + } + + public final void reset() { + start = end = 0; + } + + public void close() { + } + + public char[] toCharArray() { + char newbuf[] = new char[size()]; + System.arraycopy(buf, start, newbuf, 0, size()); + return newbuf; + } + + + @Override + public String toString() { + return new String(buf, start, size()); + } + + public int read(CharBuffer cb) throws IOException { + + /*** + int sz = size(); + if (sz<=0) return -1; + if (sz>0) cb.put(buf, start, sz); + return -1; + ***/ + + int sz = size(); + if (sz > 0) cb.put(buf, start, sz); + start = end; + while (true) { + fill(); + int s = size(); + if (s == 0) return sz == 0 ? -1 : sz; + sz += s; + cb.put(buf, start, s); + } + } + + + public int fill() throws IOException { + return 0; // or -1?
+ } + + //////////////// Appendable methods ///////////// + @Override + public final Appendable append(CharSequence csq) throws IOException { + return append(csq, 0, csq.length()); + } + + @Override + public Appendable append(CharSequence csq, int start, int end) throws IOException { + write(csq.subSequence(start, end).toString()); + return this; // the Appendable contract requires returning this appendable, never null + } + + @Override + public final Appendable append(char c) throws IOException { + write(c); + return this; + } +} + + +class NullCharArr extends CharArr { + public NullCharArr() { + super(new char[1], 0, 0); + } + + @Override + public void unsafeWrite(char b) { + } + + @Override + public void unsafeWrite(char b[], int off, int len) { + } + + @Override + public void unsafeWrite(int b) { + } + + @Override + public void write(char b) { + } + + @Override + public void write(char b[], int off, int len) { + } + + @Override + public void reserve(int num) { + } + + @Override + protected void resize(int len) { + } + + @Override + public Appendable append(CharSequence csq, int start, int end) throws IOException { + return this; + } + + @Override + public char charAt(int index) { + return 0; + } + + @Override + public void write(String s, int stringOffset, int len) { + } +} + + +// IDEA: a subclass that refills the array from a reader? +class CharArrReader extends CharArr { + protected final Reader in; + + public CharArrReader(Reader in, int size) { + super(size); + this.in = in; + } + + @Override + public int read() throws IOException { + if (start >= end) fill(); + return start >= end ? -1 : buf[start++]; + } + + @Override + public int read(CharBuffer cb) throws IOException { + // empty the buffer and then read direct + int sz = size(); + if (sz > 0) { cb.put(buf, start, sz); start = end; } // CharBuffer.put takes a length, not an end index; mark the buffered chars as consumed + int sz2 = in.read(cb); + if (sz2 >= 0) return sz + sz2; + return sz > 0 ?
sz : -1; + } + + @Override + public int fill() throws IOException { + if (start >= end) { + reset(); + } else if (start > 0) { + System.arraycopy(buf, start, buf, 0, size()); + end = size(); + start = 0; + } + /*** + // fill fully or not??? + do { + int sz = in.read(buf,end,buf.length-end); + if (sz==-1) return; + end+=sz; + } while (end < buf.length); + ***/ + + int sz = in.read(buf, end, buf.length - end); + if (sz > 0) end += sz; + return sz; + } + +} + + +class CharArrWriter extends CharArr { + protected Writer sink; + + @Override + public void flush() { + try { + sink.write(buf, start, end - start); + } catch (IOException e) { + throw new RuntimeException(e); + } + start = end = 0; + } + + @Override + public void write(char b) { + if (end >= buf.length) { + flush(); + } + unsafeWrite(b); + } + + @Override + public void write(char b[], int off, int len) { + int space = buf.length - end; + if (len < space) { + unsafeWrite(b, off, len); + } else if (len < buf.length) { + unsafeWrite(b, off, space); + flush(); + unsafeWrite(b, off + space, len - space); + } else { + flush(); + try { + sink.write(b, off, len); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + @Override + public void write(String s, int stringOffset, int len) { + int space = buf.length - end; + if (len < space) { + s.getChars(stringOffset, stringOffset + len, buf, end); + end += len; + } else if (len < buf.length) { + // if the data to write is small enough, buffer it. 
+ s.getChars(stringOffset, stringOffset + space, buf, end); + flush(); + s.getChars(stringOffset + space, stringOffset + len, buf, 0); + end = len - space; + } else { + flush(); + // don't buffer, just write to sink + try { + sink.write(s, stringOffset, len); + } catch (IOException e) { + throw new RuntimeException(e); + } + + } + } +} diff --git a/solr/solrj/src/java/org/noggit/JSONParser.java b/solr/solrj/src/java/org/noggit/JSONParser.java new file mode 100644 index 00000000000..8b1ac01bc72 --- /dev/null +++ b/solr/solrj/src/java/org/noggit/JSONParser.java @@ -0,0 +1,1297 @@ +/* + * Copyright 2006- Yonik Seeley + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.noggit; + +import java.io.IOException; +import java.io.Reader; + + +public class JSONParser { + + /** + * Event indicating a JSON string value, including member names of objects + */ + public static final int STRING = 1; + /** + * Event indicating a JSON number value which fits into a signed 64 bit integer + */ + public static final int LONG = 2; + /** + * Event indicating a JSON number value which has a fractional part or an exponent + * and with string length <= 23 chars not including sign. 
This covers + * all representations of normal values for Double.toString(). + */ + public static final int NUMBER = 3; + /** + * Event indicating a JSON number value that was not produced by toString of any + * Java primitive numerics such as Double or Long. It is either + * an integer outside the range of a 64 bit signed integer, or a floating + * point value with a string representation of more than 23 chars. + */ + public static final int BIGNUMBER = 4; + /** + * Event indicating a JSON boolean + */ + public static final int BOOLEAN = 5; + /** + * Event indicating a JSON null + */ + public static final int NULL = 6; + /** + * Event indicating the start of a JSON object + */ + public static final int OBJECT_START = 7; + /** + * Event indicating the end of a JSON object + */ + public static final int OBJECT_END = 8; + /** + * Event indicating the start of a JSON array + */ + public static final int ARRAY_START = 9; + /** + * Event indicating the end of a JSON array + */ + public static final int ARRAY_END = 10; + /** + * Event indicating the end of input has been reached + */ + public static final int EOF = 11; + + + /** + * Flags to control parsing behavior + */ + public static final int ALLOW_COMMENTS = 1 << 0; + public static final int ALLOW_SINGLE_QUOTES = 1 << 1; + public static final int ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER = 1 << 2; + public static final int ALLOW_UNQUOTED_KEYS = 1 << 3; + public static final int ALLOW_UNQUOTED_STRING_VALUES = 1 << 4; + /** + * ALLOW_EXTRA_COMMAS causes any number of extra commas in arrays and objects to be ignored + * Note that a trailing comma in [] would be [,] (hence calling the feature "trailing" commas + * is either limiting or misleading. Since trailing commas is fundamentally incompatible with any future + * "fill-in-missing-values-with-null", it was decided to extend this feature to handle any + * number of extra commas. 
+ */ + public static final int ALLOW_EXTRA_COMMAS = 1 << 5; + public static final int ALLOW_MISSING_COLON_COMMA_BEFORE_OBJECT = 1 << 6; + public static final int OPTIONAL_OUTER_BRACES = 1 << 7; + + public static final int FLAGS_STRICT = 0; + public static final int FLAGS_DEFAULT = ALLOW_COMMENTS | ALLOW_SINGLE_QUOTES | ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER | ALLOW_UNQUOTED_KEYS | ALLOW_UNQUOTED_STRING_VALUES | ALLOW_EXTRA_COMMAS; + + public static class ParseException extends RuntimeException { + public ParseException(String msg) { + super(msg); + } + } + + public static String getEventString(int e) { + switch (e) { + case STRING: + return "STRING"; + case LONG: + return "LONG"; + case NUMBER: + return "NUMBER"; + case BIGNUMBER: + return "BIGNUMBER"; + case BOOLEAN: + return "BOOLEAN"; + case NULL: + return "NULL"; + case OBJECT_START: + return "OBJECT_START"; + case OBJECT_END: + return "OBJECT_END"; + case ARRAY_START: + return "ARRAY_START"; + case ARRAY_END: + return "ARRAY_END"; + case EOF: + return "EOF"; + } + return "Unknown: " + e; + } + + private static final CharArr devNull = new NullCharArr(); + + protected int flags = FLAGS_DEFAULT; + + protected final char[] buf; // input buffer with JSON text in it + protected int start; // current position in the buffer + protected int end; // end position in the buffer (one past last valid index) + protected final Reader in; // optional reader to obtain data from + protected boolean eof = false; // true if the end of the stream was reached. + protected long gpos; // global position = gpos + start + + protected int event; // last event read + + protected int stringTerm; // The terminator for the last string we read: single quote, double quote, or 0 for unterminated. + + protected boolean missingOpeningBrace = false; + + public JSONParser(Reader in) { + this(in, new char[8192]); + // 8192 matches the default buffer size of a BufferedReader so double + // buffering of the data is avoided. 
+ } + + public JSONParser(Reader in, char[] buffer) { + this.in = in; + this.buf = buffer; + } + + // idea - if someone passes us a CharArrayReader, we could + // directly use that buffer as it's protected. + + public JSONParser(char[] data, int start, int end) { + this.in = null; + this.buf = data; + this.start = start; + this.end = end; + } + + public JSONParser(String data) { + this(data, 0, data.length()); + } + + public JSONParser(String data, int start, int end) { + this.in = null; + this.start = 0; // buf holds only data[start, end), copied to index 0 below + this.end = end - start; // so the valid range of buf is [0, end - start) + this.buf = new char[end - start]; + data.getChars(start, end, buf, 0); + } + + public int getFlags() { + return flags; + } + + public int setFlags(int flags) { + int oldFlags = this.flags; // the parameter shadows the field; capture the previous value before overwriting + this.flags = flags; + return oldFlags; + } + + // temporary output buffer + private final CharArr out = new CharArr(64); + + // We need to keep some state in order to (at a minimum) know if + // we should skip ',' or ':'. + private byte[] stack = new byte[16]; + private int ptr = 0; // pointer into the stack of parser states + private byte state = 0; // current parser state + + // parser states stored in the stack + private static final byte DID_OBJSTART = 1; // '{' just read + private static final byte DID_ARRSTART = 2; // '[' just read + private static final byte DID_ARRELEM = 3; // array element just read + private static final byte DID_MEMNAME = 4; // object member name (map key) just read + private static final byte DID_MEMVAL = 5; // object member value (map val) just read + + // info about value that was just read (or is in the middle of being read) + private int valstate; + + // push current parser state (use at start of new container) + private final void push() { + if (ptr >= stack.length) { + // doubling here is probably overkill, but anything that needs to double more than + // once (32 levels deep) is very atypical anyway.
+ byte[] newstack = new byte[stack.length << 1]; + System.arraycopy(stack, 0, newstack, 0, stack.length); + stack = newstack; + } + stack[ptr++] = state; + } + + // pop parser state (use at end of container) + private final void pop() { + if (--ptr < 0) { + throw err("Unbalanced container"); + } else { + state = stack[ptr]; + } + } + + protected void fill() throws IOException { + if (in != null) { + gpos += end; + start = 0; + int num = in.read(buf, 0, buf.length); + end = num >= 0 ? num : 0; + } + if (start >= end) eof = true; + } + + private void getMore() throws IOException { + fill(); + if (start >= end) { + throw err(null); + } + } + + protected int getChar() throws IOException { + if (start >= end) { + fill(); + if (start >= end) return -1; + } + return buf[start++]; + } + + /** + * Returns true if the given character is considered to be whitespace. + * One difference between Java's Character.isWhitespace() is that this method + * considers a hard space (non-breaking space, or nbsp) to be whitespace. + */ + protected static final boolean isWhitespace(int ch) { + return (Character.isWhitespace(ch) || ch == 0x00a0); + } + + private static final long WS_MASK = (1L << ' ') | (1L << '\t') | (1L << '\r') | (1L << '\n') | (1L << '#') | (1L << '/') | (0x01); // set 1 bit so 0xA0 will be flagged as whitespace + + protected int getCharNWS() throws IOException { + for (; ; ) { + int ch = getChar(); + // getCharNWS is normally called in the context of expecting certain JSON special characters + // such as ":}"]," + // all of these characters are below 64 (including comment chars '/' and '#', so we can make this the fast path + // even w/o checking the range first. 
We'll only get some false-positives while using bare strings (chars "IJMc") + if (((WS_MASK >> ch) & 0x01) == 0) { + return ch; + } else if (ch <= ' ') { // this will only be true if one of the whitespace bits was set + continue; + } else if (ch == '/') { + getSlashComment(); + } else if (ch == '#') { + getNewlineComment(); + } else if (!isWhitespace(ch)) { // we'll only reach here with certain bare strings, errors, or strange whitespace like 0xa0 + return ch; + } + + /*** + // getCharNWS is normally called in the context of expecting certain JSON special characters + // such as ":}"]," + // all of these characters are below 64 (including comment chars '/' and '#', so we can make this the fast path + if (ch < 64) { + if (((WS_MASK >> ch) & 0x01) == 0) return ch; + if (ch <= ' ') continue; // whitespace below a normal space + if (ch=='/') { + getSlashComment(); + } else if (ch=='#') { + getNewlineComment(); + } + } else if (!isWhitespace(ch)) { // check for higher whitespace like 0xA0 + return ch; + } + ***/ + + /** older code + switch (ch) { + case ' ' : + case '\t' : + case '\r' : + case '\n' : + continue outer; + case '#' : + getNewlineComment(); + continue outer; + case '/' : + getSlashComment(); + continue outer; + default: + return ch; + } + **/ + } + } + + protected int getCharNWS(int ch) throws IOException { + for (; ; ) { + // getCharNWS is normally called in the context of expecting certain JSON special characters + // such as ":}"]," + // all of these characters are below 64 (including comment chars '/' and '#', so we can make this the fast path + // even w/o checking the range first. We'll only get some false-positives while using bare strings (chars "IJMc") + if (((WS_MASK >> ch) & 0x01) == 0) { + return ch; + } else if (ch <= ' ') { // this will only be true if one of the whitespace bits was set + // whitespace... 
get new char at bottom of loop + } else if (ch == '/') { + getSlashComment(); + } else if (ch == '#') { + getNewlineComment(); + } else if (!isWhitespace(ch)) { // we'll only reach here with certain bare strings, errors, or strange whitespace like 0xa0 + return ch; + } + ch = getChar(); + } + } + + protected int getCharExpected(int expected) throws IOException { + for (; ; ) { + int ch = getChar(); + if (ch == expected) return expected; + if (ch == ' ') continue; + return getCharNWS(ch); + } + } + + protected void getNewlineComment() throws IOException { + // read a # or a //, so go until newline + for (; ; ) { + int ch = getChar(); + // don't worry about DOS /r/n... we'll stop on the \r and let the rest of the whitespace + // eater consume the \n + if (ch == '\n' || ch == '\r' || ch == -1) { + return; + } + } + } + + protected void getSlashComment() throws IOException { + int ch = getChar(); + if (ch == '/') { + getNewlineComment(); + return; + } + + if (ch != '*') { + throw err("Invalid comment: expected //, /*, or #"); + } + + ch = getChar(); + for (; ; ) { + if (ch == '*') { + ch = getChar(); + if (ch == '/') { + return; + } else if (ch == '*') { + // handle cases of *******/ + continue; + } + } + if (ch == -1) { + return; + } + ch = getChar(); + } + } + + + protected boolean matchBareWord(char[] arr) throws IOException { + for (int i = 1; i < arr.length; i++) { + int ch = getChar(); + if (ch != arr[i]) { + if ((flags & ALLOW_UNQUOTED_STRING_VALUES) == 0) { + throw err("Expected " + new String(arr)); + } else { + stringTerm = 0; + out.reset(); + out.write(arr, 0, i); + if (!eof) { + start--; + } + return false; + } + } + } + + // if we don't allow bare strings, we don't need to check that the string actually terminates... just + // let things fail as the parser tries to continue + if ((flags & ALLOW_UNQUOTED_STRING_VALUES) == 0) { + return true; + } + + // check that the string actually terminates... 
for example trueX should return false + int ch = getChar(); + if (eof) { + return true; + } else if (!isUnquotedStringChar(ch)) { + start--; + return true; + } + + // we encountered something like "trueX" when matching "true" + stringTerm = 0; + out.reset(); + out.unsafeWrite(arr, 0, arr.length); + out.unsafeWrite(ch); + return false; + } + + protected ParseException err(String msg) { + // We can't tell if EOF was hit by comparing start<=end + // because the illegal char could have been the last in the buffer + // or in the stream. To deal with this, the "eof" var was introduced + if (!eof && start > 0) start--; // backup one char + String chs = "char=" + ((start >= end) ? "(EOF)" : "" + buf[start]); + String pos = "position=" + (gpos + start); + String tot = chs + ',' + pos + getContext(); + if (msg == null) { + if (start >= end) msg = "Unexpected EOF"; + else msg = "JSON Parse Error"; + } + return new ParseException(msg + ": " + tot); + } + + private String getContext() { + String context = ""; + if (start >= 0) { + context += " AFTER='" + errEscape(Math.max(start - 60, 0), start + 1) + "'"; + } + if (start < end) { + context += " BEFORE='" + errEscape(start + 1, start + 40) + "'"; + } + return context; + } + + private String errEscape(int a, int b) { + b = Math.min(b, end); + if (a >= b) return ""; + return new String(buf, a, b - a).replaceAll("\\s+", " "); + } + + + private boolean bool; // boolean value read + private long lval; // long value read + private int nstate; // current state while reading a number + private static final int HAS_FRACTION = 0x01; // nstate flag, '.' already read + private static final int HAS_EXPONENT = 0x02; // nstate flag, '[eE][+-]?[0-9]' already read + + /** + * Returns the long read... only significant if valstate==LONG after + * this call. firstChar should be the first numeric digit read. 
+ */ + private long readNumber(int firstChar, boolean isNeg) throws IOException { + out.unsafeWrite(firstChar); // unsafe OK since we know output is big enough + // We build up the number in the negative plane since it's larger (by one) than + // the positive plane. + long v = '0' - firstChar; + // can't overflow a long in 18 decimal digits (i.e. 17 additional after the first). + // we also need 22 additional to handle double so we'll handle in 2 separate loops. + int i; + for (i = 0; i < 17; i++) { + int ch = getChar(); + // TODO: is this switch faster as an if-then-else? + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + v = v * 10 - (ch - '0'); + out.unsafeWrite(ch); + continue; + case '.': + out.unsafeWrite('.'); + valstate = readFrac(out, 22 - i); + return 0; + case 'e': + case 'E': + out.unsafeWrite(ch); + nstate = 0; + valstate = readExp(out, 22 - i); + return 0; + default: + // return the number, relying on nextEvent() to return an error + // for invalid chars following the number. + if (ch != -1) --start; // push back last char if not EOF + + valstate = LONG; + return isNeg ? v : -v; + } + } + + // after this, we could overflow a long and need to do extra checking + boolean overflow = false; + long maxval = isNeg ? 
Long.MIN_VALUE : -Long.MAX_VALUE; + + for (; i < 22; i++) { + int ch = getChar(); + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (v < (0x8000000000000000L / 10)) overflow = true; // can't multiply by 10 w/o overflowing + v *= 10; + int digit = ch - '0'; + if (v < maxval + digit) overflow = true; // can't add digit w/o overflowing + v -= digit; + out.unsafeWrite(ch); + continue; + case '.': + out.unsafeWrite('.'); + valstate = readFrac(out, 22 - i); + return 0; + case 'e': + case 'E': + out.unsafeWrite(ch); + nstate = 0; + valstate = readExp(out, 22 - i); + return 0; + default: + // return the number, relying on nextEvent() to return an error + // for invalid chars following the number. + if (ch != -1) --start; // push back last char if not EOF + + valstate = overflow ? BIGNUMBER : LONG; + return isNeg ? v : -v; + } + } + + + nstate = 0; + valstate = BIGNUMBER; + return 0; + } + + + // read digits right of decimal point + private int readFrac(CharArr arr, int lim) throws IOException { + nstate = HAS_FRACTION; // deliberate set instead of '|' + while (--lim >= 0) { + int ch = getChar(); + if (ch >= '0' && ch <= '9') { + arr.write(ch); + } else if (ch == 'e' || ch == 'E') { + arr.write(ch); + return readExp(arr, lim); + } else { + if (ch != -1) start--; // back up + return NUMBER; + } + } + return BIGNUMBER; + } + + + // call after 'e' or 'E' has been seen to read the rest of the exponent + private int readExp(CharArr arr, int lim) throws IOException { + nstate |= HAS_EXPONENT; + int ch = getChar(); + lim--; + + if (ch == '+' || ch == '-') { + arr.write(ch); + ch = getChar(); + lim--; + } + + // make sure at least one digit is read. 
+ if (ch < '0' || ch > '9') { + throw err("missing exponent number"); + } + arr.write(ch); + + return readExpDigits(arr, lim); + } + + // continuation of readExpStart + private int readExpDigits(CharArr arr, int lim) throws IOException { + while (--lim >= 0) { + int ch = getChar(); + if (ch >= '0' && ch <= '9') { + arr.write(ch); + } else { + if (ch != -1) start--; // back up + return NUMBER; + } + } + return BIGNUMBER; + } + + private void continueNumber(CharArr arr) throws IOException { + if (arr != out) arr.write(out); + + if ((nstate & HAS_EXPONENT) != 0) { + readExpDigits(arr, Integer.MAX_VALUE); + return; + } + if (nstate != 0) { + readFrac(arr, Integer.MAX_VALUE); + return; + } + + for (; ; ) { + int ch = getChar(); + if (ch >= '0' && ch <= '9') { + arr.write(ch); + } else if (ch == '.') { + arr.write(ch); + readFrac(arr, Integer.MAX_VALUE); + return; + } else if (ch == 'e' || ch == 'E') { + arr.write(ch); + readExp(arr, Integer.MAX_VALUE); + return; + } else { + if (ch != -1) start--; + return; + } + } + } + + + private int hexval(int hexdig) { + if (hexdig >= '0' && hexdig <= '9') { + return hexdig - '0'; + } else if (hexdig >= 'A' && hexdig <= 'F') { + return hexdig + (10 - 'A'); + } else if (hexdig >= 'a' && hexdig <= 'f') { + return hexdig + (10 - 'a'); + } + throw err("invalid hex digit"); + } + + // backslash has already been read when this is called + private char readEscapedChar() throws IOException { + int ch = getChar(); + switch (ch) { + case '"': + return '"'; + case '\'': + return '\''; + case '\\': + return '\\'; + case '/': + return '/'; + case 'n': + return '\n'; + case 'r': + return '\r'; + case 't': + return '\t'; + case 'f': + return '\f'; + case 'b': + return '\b'; + case 'u': + return (char) ( + (hexval(getChar()) << 12) + | (hexval(getChar()) << 8) + | (hexval(getChar()) << 4) + | (hexval(getChar()))); + } + if ((flags & ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER) != 0 && ch != EOF) { + return (char) ch; + } + throw err("Invalid character 
escape"); + } + + // a dummy buffer we can use to point at other buffers + private final CharArr tmp = new CharArr(null, 0, 0); + + private CharArr readStringChars() throws IOException { + if (stringTerm == 0) { + // "out" will already contain the first part of the bare string, so don't reset it + readStringBare(out); + return out; + } + + char terminator = (char) stringTerm; + int i; + for (i = start; i < end; i++) { + char c = buf[i]; + if (c == terminator) { + tmp.set(buf, start, i); // directly use input buffer + start = i + 1; // advance past last '"' + return tmp; + } else if (c == '\\') { + break; + } + } + out.reset(); + readStringChars2(out, i); + return out; + } + + + // middle is the pointer to the middle of a buffer to start scanning for a non-string + // character ('"' or "/"). start<=middle= end) { + arr.write(buf, start, middle - start); + start = middle; + getMore(); + middle = start; + } + int ch = buf[middle++]; + if (ch == terminator) { + int len = middle - start - 1; + if (len > 0) arr.write(buf, start, len); + start = middle; + return; + } else if (ch == '\\') { + int len = middle - start - 1; + if (len > 0) arr.write(buf, start, len); + start = middle; + arr.write(readEscapedChar()); + middle = start; + } + } + } + + private void readStringBare(CharArr arr) throws IOException { + if (arr != out) { + arr.append(out); + } + + for (; ; ) { + int ch = getChar(); + if (!isUnquotedStringChar(ch)) { + if (ch == -1) break; + if (ch == '\\') { + arr.write(readEscapedChar()); + continue; + } + start--; + break; + } + + if (ch == '\\') { + arr.write(readEscapedChar()); + continue; + } + + arr.write(ch); + } + } + + + // isName==true if this is a field name (as opposed to a value) + protected void handleNonDoubleQuoteString(int ch, boolean isName) throws IOException { + if (ch == '\'') { + stringTerm = ch; + if ((flags & ALLOW_SINGLE_QUOTES) == 0) { + throw err("Single quoted strings not allowed"); + } + } else { + if (isName && (flags & 
ALLOW_UNQUOTED_KEYS) == 0 + || !isName && (flags & ALLOW_UNQUOTED_STRING_VALUES) == 0 + || eof) { + if (isName) { + throw err("Expected quoted string"); + } else { + throw err(null); + } + } + + if (!isUnquotedStringStart(ch)) { + throw err(null); + } + + stringTerm = 0; // signal for unquoted string + out.reset(); + out.unsafeWrite(ch); + } + } + + private static boolean isUnquotedStringStart(int ch) { + return Character.isJavaIdentifierStart(ch); + } + + // What characters are allowed to continue an unquoted string + // once we know we are in one. + private static boolean isUnquotedStringChar(int ch) { + return Character.isJavaIdentifierPart(ch) + || ch == '.' + || ch == '-' + || ch == '/'; + + // would checking for a-z first speed up the common case? + + // possibly much more liberal unquoted string handling... + /*** + switch (ch) { + case -1: + case ' ': + case '\t': + case '\r': + case '\n': + case '}': + case ']': + case ',': + case ':': + case '=': // reserved for future use + case '\\': // check for backslash should come after this function call + return false; + } + return true; + ***/ + } + + + /*** alternate implementation + // middle is the pointer to the middle of a buffer to start scanning for a non-string + // character ('"' or "/"). start<=middle=end) { + getMore(); + middle=start; + } else { + start = middle+1; // set buffer pointer to correct spot + if (ch=='"') { + valstate=0; + return; + } else if (ch=='\\') { + arr.write(readEscapedChar()); + if (start>=end) getMore(); + middle=start; + } + } + } + } + ***/ + + + // return the next event when parser is in a neutral state (no + // map separators or array element separators to read + private int next(int ch) throws IOException { + // TODO: try my own form of indirect jump... look up char class and index directly into handling implementation? + for (; ; ) { + switch (ch) { + case ' ': // this is not the exclusive list of whitespace chars... 
the rest are handled in default: + case '\t': + case '\r': + case '\n': + ch = getCharNWS(); // calling getCharNWS here seems faster than letting the switch handle it + break; + case '"': + stringTerm = '"'; + valstate = STRING; + return STRING; + case '\'': + if ((flags & ALLOW_SINGLE_QUOTES) == 0) { + throw err("Single quoted strings not allowed"); + } + stringTerm = '\''; + valstate = STRING; + return STRING; + case '{': + push(); + state = DID_OBJSTART; + return OBJECT_START; + case '[': + push(); + state = DID_ARRSTART; + return ARRAY_START; + case '0': + out.reset(); + //special case '0'? If next char isn't '.' val=0 + ch = getChar(); + if (ch == '.') { + start--; + ch = '0'; + readNumber('0', false); + return valstate; + } else if (ch > '9' || ch < '0') { + out.unsafeWrite('0'); + if (ch != -1) start--; + lval = 0; + valstate = LONG; + return LONG; + } else { + throw err("Leading zeros not allowed"); + } + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + out.reset(); + lval = readNumber(ch, false); + return valstate; + case '-': + out.reset(); + out.unsafeWrite('-'); + ch = getChar(); + if (ch < '0' || ch > '9') throw err("expected digit after '-'"); + lval = readNumber(ch, true); + return valstate; + case 't': + // TODO: test performance of this non-branching inline version. 
+ // if ((('r'-getChar())|('u'-getChar())|('e'-getChar())) != 0) throw err(""); + if (matchBareWord(JSONUtil.TRUE_CHARS)) { + bool = true; + valstate = BOOLEAN; + return valstate; + } else { + valstate = STRING; + return STRING; + } + case 'f': + if (matchBareWord(JSONUtil.FALSE_CHARS)) { + bool = false; + valstate = BOOLEAN; + return valstate; + } else { + valstate = STRING; + return STRING; + } + case 'n': + if (matchBareWord(JSONUtil.NULL_CHARS)) { + valstate = NULL; + return valstate; + } else { + valstate = STRING; + return STRING; + } + case '/': + getSlashComment(); + ch = getChar(); + break; + case '#': + getNewlineComment(); + ch = getChar(); + break; + case ']': // This only happens with a trailing comma (or an error) + if (state != DID_ARRELEM || (flags & ALLOW_EXTRA_COMMAS) == 0) { + throw err("Unexpected array closer ]"); + } + pop(); + return event = ARRAY_END; + case '}': // This only happens with a trailing comma (or an error) + if (state != DID_MEMVAL || (flags & ALLOW_EXTRA_COMMAS) == 0) { + throw err("Unexpected object closer }"); + } + pop(); + return event = ARRAY_END; + case ',': // This only happens with input like [1,] + if ((state != DID_ARRELEM && state != DID_MEMVAL) || (flags & ALLOW_EXTRA_COMMAS) == 0) { + throw err("Unexpected comma"); + } + ch = getChar(); + break; + case -1: + if (getLevel() > 0) throw err("Premature EOF"); + return EOF; + default: + // Handle unusual unicode whitespace like no-break space (0xA0) + if (isWhitespace(ch)) { + ch = getChar(); // getCharNWS() would also work + break; + } + handleNonDoubleQuoteString(ch, false); + valstate = STRING; + return STRING; + // throw err(null); + } + + } + } + + @Override + public String toString() { + return "start=" + start + ",end=" + end + ",state=" + state + "valstate=" + valstate; + } + + + /** + * Returns the next event encountered in the JSON stream, one of + *
<ul>
+ * <li>{@link #STRING}</li>
+ * <li>{@link #LONG}</li>
+ * <li>{@link #NUMBER}</li>
+ * <li>{@link #BIGNUMBER}</li>
+ * <li>{@link #BOOLEAN}</li>
+ * <li>{@link #NULL}</li>
+ * <li>{@link #OBJECT_START}</li>
+ * <li>{@link #OBJECT_END}</li>
+ * <li>{@link #OBJECT_END}</li>
+ * <li>{@link #ARRAY_START}</li>
+ * <li>{@link #ARRAY_END}</li>
+ * <li>{@link #EOF}</li>
+ * </ul>
+ */ + public int nextEvent() throws IOException { + if (valstate != 0) { + if (valstate == STRING) { + readStringChars2(devNull, start); + } else if (valstate == BIGNUMBER) { + continueNumber(devNull); + } + valstate = 0; + } + + int ch; + outer: + for (; ; ) { + switch (state) { + case 0: + event = next(getChar()); + if (event == STRING && (flags & OPTIONAL_OUTER_BRACES) != 0) { + if (start > 0) start--; + missingOpeningBrace = true; + stringTerm = 0; + valstate = 0; + event = next('{'); + } + return event; + case DID_OBJSTART: + ch = getCharExpected('"'); + if (ch == '}') { + pop(); + return event = OBJECT_END; + } + if (ch == '"') { + stringTerm = ch; + } else if (ch == ',' && (flags & ALLOW_EXTRA_COMMAS) != 0) { + continue outer; + } else { + handleNonDoubleQuoteString(ch, true); + } + state = DID_MEMNAME; + valstate = STRING; + return event = STRING; + case DID_MEMNAME: + ch = getCharExpected(':'); + if (ch != ':') { + if ((ch == '{' || ch == '[') && (flags & ALLOW_MISSING_COLON_COMMA_BEFORE_OBJECT) != 0) { + start--; + } else { + throw err("Expected key,value separator ':'"); + } + } + state = DID_MEMVAL; // set state first because it might be pushed... 
+ return event = next(getChar()); + case DID_MEMVAL: + ch = getCharExpected(','); + if (ch == '}') { + pop(); + return event = OBJECT_END; + } else if (ch != ',') { + if ((flags & ALLOW_EXTRA_COMMAS) != 0 && (ch == '\'' || ch == '"' || Character.isLetter(ch))) { + start--; + } else if (missingOpeningBrace && ch == -1 && (flags & OPTIONAL_OUTER_BRACES) != 0) { + missingOpeningBrace = false; + pop(); + return event = OBJECT_END; + } else throw err("Expected ',' or '}'"); + } + ch = getCharExpected('"'); + if (ch == '"') { + stringTerm = ch; + } else if ((ch == ',' || ch == '}') && (flags & ALLOW_EXTRA_COMMAS) != 0) { + if (ch == ',') continue outer; + pop(); + return event = OBJECT_END; + } else { + handleNonDoubleQuoteString(ch, true); + } + state = DID_MEMNAME; + valstate = STRING; + return event = STRING; + case DID_ARRSTART: + ch = getCharNWS(); + if (ch == ']') { + pop(); + return event = ARRAY_END; + } + state = DID_ARRELEM; // set state first, might be pushed... + return event = next(ch); + case DID_ARRELEM: + ch = getCharExpected(','); + if (ch == ',') { + // state = DID_ARRELEM; // redundant + return event = next(getChar()); + } else if (ch == ']') { + pop(); + return event = ARRAY_END; + } else { + if ((ch == '{' || ch == '[') && (flags & ALLOW_MISSING_COLON_COMMA_BEFORE_OBJECT) != 0) { + return event = next(ch); + } else { + throw err("Expected ',' or ']'"); + } + } + } + } // end for(;;) + } + + public int lastEvent() { + return event; + } + + public boolean wasKey() { + return state == DID_MEMNAME; + } + + + private void goTo(int what) throws IOException { + if (valstate == what) { + valstate = 0; + return; + } + if (valstate == 0) { + /*int ev = */ + nextEvent(); // TODO + if (valstate != what) { + throw err("type mismatch"); + } + valstate = 0; + } else { + throw err("type mismatch"); + } + } + + /** + * Returns the JSON string value, decoding any escaped characters. 
+ */ + public String getString() throws IOException { + return getStringChars().toString(); + } + + /** + * Returns the characters of a JSON string value, decoding any escaped characters. + * The underlying buffer of the returned CharArr should *not* be + * modified as it may be shared with the input buffer. + * The returned CharArr will only be valid up until + * the next JSONParser method is called. Any required data should be + * read before that point. + */ + public CharArr getStringChars() throws IOException { + goTo(STRING); + return readStringChars(); + } + + /** + * Reads a JSON string into the output, decoding any escaped characters. + */ + public void getString(CharArr output) throws IOException { + goTo(STRING); + readStringChars2(output, start); + } + + /** + * Reads a number from the input stream and parses it as a long, only if + * the value will in fact fit into a signed 64 bit integer. + */ + public long getLong() throws IOException { + goTo(LONG); + return lval; + } + + /** + * Reads a number from the input stream and parses it as a double + */ + public double getDouble() throws IOException { + return Double.parseDouble(getNumberChars().toString()); + } + + /** + * Returns the characters of a JSON numeric value. + *

<p>The underlying buffer of the returned CharArr should *not* be + * modified as it may be shared with the input buffer. + *
<p>
The returned CharArr will only be valid up until + * the next JSONParser method is called. Any required data should be + * read before that point. + */ + public CharArr getNumberChars() throws IOException { + int ev = 0; + if (valstate == 0) ev = nextEvent(); + + if (valstate == LONG || valstate == NUMBER) { + valstate = 0; + return out; + } else if (valstate == BIGNUMBER) { + continueNumber(out); + valstate = 0; + return out; + } else { + throw err("Unexpected " + ev); + } + } + + /** + * Reads a JSON numeric value into the output. + */ + public void getNumberChars(CharArr output) throws IOException { + int ev = 0; + if (valstate == 0) ev = nextEvent(); + if (valstate == LONG || valstate == NUMBER) output.write(this.out); + else if (valstate == BIGNUMBER) { + continueNumber(output); + } else { + throw err("Unexpected " + ev); + } + valstate = 0; + } + + /** + * Reads a boolean value + */ + public boolean getBoolean() throws IOException { + goTo(BOOLEAN); + return bool; + } + + /** + * Reads a null value + */ + public void getNull() throws IOException { + goTo(NULL); + } + + /** + * @return the current nesting level, the number of parent objects or arrays. + */ + public int getLevel() { + return ptr; + } + + public long getPosition() { + return gpos + start; + } +} \ No newline at end of file diff --git a/solr/solrj/src/java/org/noggit/JSONUtil.java b/solr/solrj/src/java/org/noggit/JSONUtil.java new file mode 100644 index 00000000000..4c74759c1f6 --- /dev/null +++ b/solr/solrj/src/java/org/noggit/JSONUtil.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.noggit; + + +public class JSONUtil { + public static final char[] TRUE_CHARS = new char[]{'t', 'r', 'u', 'e'}; + public static final char[] FALSE_CHARS = new char[]{'f', 'a', 'l', 's', 'e'}; + public static final char[] NULL_CHARS = new char[]{'n', 'u', 'l', 'l'}; + public static final char[] HEX_CHARS = new char[]{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; + public static final char VALUE_SEPARATOR = ','; + public static final char NAME_SEPARATOR = ':'; + public static final char OBJECT_START = '{'; + public static final char OBJECT_END = '}'; + public static final char ARRAY_START = '['; + public static final char ARRAY_END = ']'; + + public static String toJSON(Object o) { + CharArr out = new CharArr(); + new JSONWriter(out).write(o); + return out.toString(); + } + + /** + * @param o The object to convert to JSON + * @param indentSize The number of space characters to use as an indent (default 2). 0=newlines but no spaces, -1=no indent at all. 
+ */ + public static String toJSON(Object o, int indentSize) { + CharArr out = new CharArr(); + new JSONWriter(out, indentSize).write(o); + return out.toString(); + } + + public static void writeNumber(int number, CharArr out) { + out.write(Integer.toString(number)); + } + + public static void writeNumber(long number, CharArr out) { + out.write(Long.toString(number)); + } + + public static void writeNumber(float number, CharArr out) { + out.write(Float.toString(number)); + } + + public static void writeNumber(double number, CharArr out) { + out.write(Double.toString(number)); + } + + public static void writeString(CharArr val, CharArr out) { + writeString(val.getArray(), val.getStart(), val.getEnd(), out); + } + + public static void writeString(char[] val, int start, int end, CharArr out) { + out.write('"'); + writeStringPart(val, start, end, out); + out.write('"'); + } + + public static void writeString(String val, int start, int end, CharArr out) { + out.write('"'); + writeStringPart(val, start, end, out); + out.write('"'); + } + + public static void writeString(CharSequence val, int start, int end, CharArr out) { + out.write('"'); + writeStringPart(val, start, end, out); + out.write('"'); + } + + public static void writeStringPart(char[] val, int start, int end, CharArr out) { + for (int i = start; i < end; i++) { + char ch = val[i]; + // When ch>=1f, (ch*146087937)&0xd6a01f80) is 0 only for characters that need escaping: " \\ u2028 u2029 + // and has 7 false positives: 204a 4051 802f c022 c044 e04a e04b + if (ch > 0x1f && ((ch * 146087937) & 0xd6a01f80) != 0) { + out.write(ch); + } else { + writeChar(ch, out); + } + } + } + + public static void writeChar(char ch, CharArr out) { + switch (ch) { + case '"': + case '\\': + out.write('\\'); + out.write(ch); + break; + case '\r': + out.write('\\'); + out.write('r'); + break; + case '\n': + out.write('\\'); + out.write('n'); + break; + case '\t': + out.write('\\'); + out.write('t'); + break; + case '\b': + 
out.write('\\'); + out.write('b'); + break; + case '\f': + out.write('\\'); + out.write('f'); + break; + // case '/': + case '\u2028': // valid JSON, but not valid json script + case '\u2029': + unicodeEscape(ch, out); + break; + default: + if (ch <= 0x1F) { + unicodeEscape(ch, out); + } else { + out.write(ch); + } + } + } + + + public static void writeStringPart(String chars, int start, int end, CharArr out) { + // TODO: write in chunks? + + int toWrite = end - start; + char[] arr = out.getArray(); + int pos = out.getEnd(); + int space = arr.length - pos; + if (space < toWrite) { + writeStringPart((CharSequence) chars, start, end, out); + return; + } + + // get chars directly from String into output array + chars.getChars(start, end, arr, pos); + + int endInOut = pos + toWrite; + out.setEnd(endInOut); + for (int i = pos; i < endInOut; i++) { + char ch = arr[i]; + + // When ch>=1f, (ch*146087937)&0xd6a01f80) is 0 only for characters that need escaping: " \\ u2028 u2029 + // and has 7 false positives: 204a 4051 802f c022 c044 e04a e04b + if (ch <= 0x1f || ((ch * 146087937) & 0xd6a01f80) == 0) { + // We hit a char that needs escaping. do the rest char by char. 
+ out.setEnd(i); + writeStringPart((CharSequence) chars, start + (i - pos), end, out); + return; + } + } + } + + public static void writeStringPart(CharSequence chars, int start, int end, CharArr out) { + for (int i = start; i < end; i++) { + char ch = chars.charAt(i); + // When ch>=1f, (ch*146087937)&0xd6a01f80) is 0 only for characters that need escaping: " \\ u2028 u2029 + // and has 7 false positives: 204a 4051 802f c022 c044 e04a e04b + if (ch > 0x1f && ((ch * 146087937) & 0xd6a01f80) != 0) { + out.write(ch); + } else { + writeChar(ch, out); + } + } + } + + + public static void unicodeEscape(int ch, CharArr out) { + out.write('\\'); + out.write('u'); + out.write(HEX_CHARS[ch >>> 12]); + out.write(HEX_CHARS[(ch >>> 8) & 0xf]); + out.write(HEX_CHARS[(ch >>> 4) & 0xf]); + out.write(HEX_CHARS[ch & 0xf]); + } + + public static void writeNull(CharArr out) { + out.write(NULL_CHARS); + } + + public static void writeBoolean(boolean val, CharArr out) { + out.write(val ? TRUE_CHARS : FALSE_CHARS); + } + +} \ No newline at end of file diff --git a/solr/solrj/src/java/org/noggit/JSONWriter.java b/solr/solrj/src/java/org/noggit/JSONWriter.java new file mode 100644 index 00000000000..dfec390db4c --- /dev/null +++ b/solr/solrj/src/java/org/noggit/JSONWriter.java @@ -0,0 +1,358 @@ +/* + * Copyright 2006- Yonik Seeley + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.noggit; + +import java.util.*; + + +public class JSONWriter { + + /** + * Implement this interface on your class to support serialization + */ + public static interface Writable { + public void write(JSONWriter writer); + } + + protected int level; + protected int indent; + protected final CharArr out; + + /** + * @param out the CharArr to write the output to. + * @param indentSize The number of space characters to use as an indent (default 2). 0=newlines but no spaces, -1=no indent at all. + */ + public JSONWriter(CharArr out, int indentSize) { + this.out = out; + this.indent = indentSize; + } + + public JSONWriter(CharArr out) { + this(out, 2); + } + + public void setIndentSize(int indentSize) { + this.indent = indentSize; + } + + public void indent() { + if (indent >= 0) { + out.write('\n'); + if (indent > 0) { + int spaces = level * indent; + out.reserve(spaces); + for (int i = 0; i < spaces; i++) { + out.unsafeWrite(' '); + } + } + } + } + + public void write(Object o) { + // NOTE: an instance-of chain was about 50% faster than hashing on the classes, even with perfect hashing. 
+ if (o == null) { + writeNull(); + } else if (o instanceof String) { + writeString((String) o); + } else if (o instanceof Number) { + if (o instanceof Integer || o instanceof Long) { + write(((Number) o).longValue()); + } else if (o instanceof Float || o instanceof Double) { + write(((Number) o).doubleValue()); + } else { + CharArr arr = new CharArr(); + arr.write(o.toString()); + writeNumber(arr); + } + } else if (o instanceof Map) { + write((Map) o); + } else if (o instanceof Collection) { + write((Collection) o); + } else if (o instanceof Boolean) { + write(((Boolean) o).booleanValue()); + } else if (o instanceof CharSequence) { + writeString((CharSequence) o); + } else if (o instanceof Writable) { + ((Writable) o).write(this); + } else if (o instanceof Object[]) { + write(Arrays.asList((Object[]) o)); + } else if (o instanceof int[]) { + write((int[]) o); + } else if (o instanceof float[]) { + write((float[]) o); + } else if (o instanceof long[]) { + write((long[]) o); + } else if (o instanceof double[]) { + write((double[]) o); + } else if (o instanceof short[]) { + write((short[]) o); + } else if (o instanceof boolean[]) { + write((boolean[]) o); + } else if (o instanceof char[]) { + write((char[]) o); + } else if (o instanceof byte[]) { + write((byte[]) o); + } else { + handleUnknownClass(o); + } + } + + /** + * Override this method for custom handling of unknown classes. Also see the Writable interface. 
+ */ + public void handleUnknownClass(Object o) { + writeString(o.toString()); + } + + public void write(Map val) { + startObject(); + int sz = val.size(); + boolean first = true; + for (Map.Entry entry : val.entrySet()) { + if (first) { + first = false; + } else { + writeValueSeparator(); + } + if (sz > 1) indent(); + writeString(entry.getKey().toString()); + writeNameSeparator(); + write(entry.getValue()); + } + endObject(); + } + + public void write(Collection val) { + startArray(); + int sz = val.size(); + boolean first = true; + for (Object o : val) { + if (first) { + first = false; + } else { + writeValueSeparator(); + } + if (sz > 1) indent(); + write(o); + } + endArray(); + } + + /** + * A byte[] may be either a single logical value, or a list of small integers. + * It's up to the implementation to decide. + */ + public void write(byte[] val) { + startArray(); + boolean first = true; + for (short v : val) { + if (first) { + first = false; + } else { + writeValueSeparator(); + } + write(v); + } + endArray(); + } + + public void write(short[] val) { + startArray(); + boolean first = true; + for (short v : val) { + if (first) { + first = false; + } else { + writeValueSeparator(); + } + write(v); + } + endArray(); + } + + public void write(int[] val) { + startArray(); + boolean first = true; + for (int v : val) { + if (first) { + first = false; + } else { + writeValueSeparator(); + } + write(v); + } + endArray(); + } + + public void write(long[] val) { + startArray(); + boolean first = true; + for (long v : val) { + if (first) { + first = false; + } else { + writeValueSeparator(); + } + write(v); + } + endArray(); + } + + public void write(float[] val) { + startArray(); + boolean first = true; + for (float v : val) { + if (first) { + first = false; + } else { + writeValueSeparator(); + } + write(v); + } + endArray(); + } + + public void write(double[] val) { + startArray(); + boolean first = true; + for (double v : val) { + if (first) { + first = false; + } 
else { + writeValueSeparator(); + } + write(v); + } + endArray(); + } + + public void write(boolean[] val) { + startArray(); + boolean first = true; + for (boolean v : val) { + if (first) { + first = false; + } else { + writeValueSeparator(); + } + write(v); + } + endArray(); + } + + + public void write(short number) { + write((int) number); + } + + public void write(byte number) { + write((int) number); + } + + + public void writeNull() { + JSONUtil.writeNull(out); + } + + public void writeString(String str) { + JSONUtil.writeString(str, 0, str.length(), out); + } + + public void writeString(CharSequence str) { + JSONUtil.writeString(str, 0, str.length(), out); + } + + public void writeString(CharArr str) { + JSONUtil.writeString(str, out); + } + + public void writeStringStart() { + out.write('"'); + } + + public void writeStringChars(CharArr partialStr) { + JSONUtil.writeStringPart(partialStr.getArray(), partialStr.getStart(), partialStr.getEnd(), out); + } + + public void writeStringEnd() { + out.write('"'); + } + + public void write(long number) { + JSONUtil.writeNumber(number, out); + } + + public void write(int number) { + JSONUtil.writeNumber(number, out); + } + + public void write(double number) { + JSONUtil.writeNumber(number, out); + } + + public void write(float number) { + JSONUtil.writeNumber(number, out); + } + + public void write(boolean bool) { + JSONUtil.writeBoolean(bool, out); + } + + public void write(char[] val) { + JSONUtil.writeString(val, 0, val.length, out); + } + + public void writeNumber(CharArr digits) { + out.write(digits); + } + + public void writePartialNumber(CharArr digits) { + out.write(digits); + } + + public void startObject() { + out.write('{'); + level++; + } + + public void endObject() { + out.write('}'); + level--; + } + + public void startArray() { + out.write('['); + level++; + } + + public void endArray() { + out.write(']'); + level--; + } + + public void writeValueSeparator() { + out.write(','); + } + + public void 
writeNameSeparator() { + out.write(':'); + } + +} diff --git a/solr/solrj/src/java/org/noggit/ObjectBuilder.java b/solr/solrj/src/java/org/noggit/ObjectBuilder.java new file mode 100644 index 00000000000..945a96b20b0 --- /dev/null +++ b/solr/solrj/src/java/org/noggit/ObjectBuilder.java @@ -0,0 +1,168 @@ +/* + * Copyright 2006- Yonik Seeley + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.noggit;
+
+
+import java.util.*;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+
+/**
+ * Builds plain Java values from the events produced by a {@link JSONParser}.
+ *
+ * <p>By default a JSON object is collected into a {@link LinkedHashMap} and a JSON
+ * array into an {@link ArrayList}. The container hooks ({@link #newObject()},
+ * {@link #addKeyVal(Object, Object, Object)}, {@link #objectEnd(Object)},
+ * {@link #newArray()}, {@link #addArrayVal(Object, Object)}, {@link #endArray(Object)})
+ * and the scalar getters all traffic in {@code Object} and are overridable.
+ */
+public class ObjectBuilder {
+
+  /**
+   * Builds the first value found in the given JSON text.
+   *
+   * @param json the JSON text to read
+   * @return the value produced by {@link #getVal()} for the first event of the input
+   * @throws IOException if the underlying parser reports one
+   */
+  public static Object fromJSON(String json) throws IOException {
+    return getVal(new JSONParser(json));
+  }
+
+  /**
+   * Builds one value from the given parser using a default {@code ObjectBuilder}.
+   *
+   * @param parser the parser to read events from
+   * @return the value produced by {@link #getVal()}
+   * @throws IOException if the underlying parser reports one
+   */
+  public static Object getVal(JSONParser parser) throws IOException {
+    return new ObjectBuilder(parser).getVal();
+  }
+
+  /** Parser that supplies the events; it is advanced as values are consumed. */
+  final JSONParser parser;
+
+  /**
+   * Creates a builder on top of {@code parser}.
+   *
+   * @param parser the parser to read events from
+   * @throws IOException if advancing the parser fails
+   */
+  public ObjectBuilder(JSONParser parser) throws IOException {
+    this.parser = parser;
+    // NOTE(review): lastEvent() == 0 presumably means no event has been read yet;
+    // in that case move to the first event so getVal() has something to look at.
+    if (parser.lastEvent() == 0) {
+      parser.nextEvent();
+    }
+  }
+
+  /**
+   * Converts the parser's last event into a Java value, reading nested objects and
+   * arrays recursively.
+   *
+   * @return the value for the current event; {@code null} for a JSON null and also
+   *         for {@code OBJECT_END}, {@code ARRAY_END}, {@code EOF} or any other event
+   * @throws IOException if the underlying parser reports one
+   */
+  public Object getVal() throws IOException {
+    switch (parser.lastEvent()) {
+      case JSONParser.STRING:
+        return getString();
+      case JSONParser.LONG:
+        return getLong();
+      case JSONParser.NUMBER:
+        return getNumber();
+      case JSONParser.BIGNUMBER:
+        return getBigNumber();
+      case JSONParser.BOOLEAN:
+        return getBoolean();
+      case JSONParser.NULL:
+        return getNull();
+      case JSONParser.OBJECT_START:
+        return getObject();
+      case JSONParser.ARRAY_START:
+        return getArray();
+      case JSONParser.OBJECT_END:
+      case JSONParser.ARRAY_END:
+      case JSONParser.EOF:
+      default:
+        // These events carry no value. Whether they should be reported as an
+        // error instead is an open question; null is kept for compatibility.
+        return null;
+    }
+  }
+
+  /**
+   * @return the current string value as returned by {@code parser.getString()}
+   * @throws IOException if the underlying parser reports one
+   */
+  public Object getString() throws IOException {
+    return parser.getString();
+  }
+
+  /**
+   * @return the current integral value as a {@link Long}
+   * @throws IOException if the underlying parser reports one
+   */
+  public Object getLong() throws IOException {
+    return Long.valueOf(parser.getLong());
+  }
+
+  /**
+   * Reads the current number as a {@link Double}, falling back to a
+   * {@link BigDecimal} when the text parses to an infinite double.
+   *
+   * @return a {@code Double}, or a {@code BigDecimal} built from the same characters
+   * @throws IOException if the underlying parser reports one
+   */
+  public Object getNumber() throws IOException {
+    CharArr num = parser.getNumberChars();
+    double d = Double.parseDouble(num.toString());
+    if (!Double.isInfinite(d)) {
+      return Double.valueOf(d);
+    }
+    // Out of double range: keep the exact digits instead of returning infinity.
+    return new BigDecimal(num.buf, num.start, num.size());
+  }
+
+  /**
+   * Reads the current number as an arbitrary-precision value.
+   *
+   * @return a {@link BigDecimal} if the text contains {@code '.'}, {@code 'e'} or
+   *         {@code 'E'}; otherwise a {@link BigInteger}
+   * @throws IOException if the underlying parser reports one
+   */
+  public Object getBigNumber() throws IOException {
+    String numstr = parser.getNumberChars().toString();
+    // Classify using the string that is about to be parsed, rather than draining
+    // the parser-owned character buffer.
+    for (int i = 0; i < numstr.length(); i++) {
+      char ch = numstr.charAt(i);
+      if (ch == '.' || ch == 'e' || ch == 'E') {
+        return new BigDecimal(numstr);
+      }
+    }
+    return new BigInteger(numstr);
+  }
+
+  /**
+   * @return the current boolean value, boxed
+   * @throws IOException if the underlying parser reports one
+   */
+  public Object getBoolean() throws IOException {
+    return parser.getBoolean();
+  }
+
+  /**
+   * Consumes the current JSON null via {@code parser.getNull()}.
+   *
+   * @return always {@code null}
+   * @throws IOException if the underlying parser reports one
+   */
+  public Object getNull() throws IOException {
+    parser.getNull();
+    return null;
+  }
+
+  /**
+   * @return a new, empty container for a JSON object (a {@link LinkedHashMap} here)
+   * @throws IOException not thrown by this implementation
+   */
+  public Object newObject() throws IOException {
+    return new LinkedHashMap<Object, Object>();
+  }
+
+  /**
+   * @return the current object key as returned by {@code parser.getString()}
+   * @throws IOException if the underlying parser reports one
+   */
+  public Object getKey() throws IOException {
+    return parser.getString();
+  }
+
+  /**
+   * Stores one key/value pair in an object container. A repeated key replaces the
+   * value stored earlier, as with any {@link Map#put}.
+   *
+   * @param map the container returned by {@link #newObject()}; must be a {@link Map}
+   * @param key the key
+   * @param val the value
+   * @throws IOException not thrown by this implementation
+   */
+  @SuppressWarnings("unchecked")
+  public void addKeyVal(Object map, Object key, Object val) throws IOException {
+    ((Map<Object, Object>) map).put(key, val);
+  }
+
+  /**
+   * Called once an object has been fully read.
+   *
+   * @param obj the populated container
+   * @return the value to hand back to the caller; this implementation returns {@code obj}
+   */
+  public Object objectEnd(Object obj) {
+    return obj;
+  }
+
+  /**
+   * Reads key/value pairs until {@code OBJECT_END} is seen.
+   *
+   * @return the result of {@link #objectEnd(Object)} for the populated container
+   * @throws IOException if the underlying parser reports one
+   */
+  public Object getObject() throws IOException {
+    Object m = newObject();
+    while (parser.nextEvent() != JSONParser.OBJECT_END) {
+      Object key = getKey();
+      parser.nextEvent(); // step from the key to its value
+      Object val = getVal();
+      addKeyVal(m, key, val);
+    }
+    return objectEnd(m);
+  }
+
+  /**
+   * @return a new, empty container for a JSON array (an {@link ArrayList} here)
+   */
+  public Object newArray() {
+    return new ArrayList<Object>();
+  }
+
+  /**
+   * Appends one element to an array container.
+   *
+   * @param arr the container returned by {@link #newArray()}; must be a {@link List}
+   * @param val the element to append
+   * @throws IOException not thrown by this implementation
+   */
+  @SuppressWarnings("unchecked")
+  public void addArrayVal(Object arr, Object val) throws IOException {
+    ((List<Object>) arr).add(val);
+  }
+
+  /**
+   * Called once an array has been fully read.
+   *
+   * @param arr the populated container
+   * @return the value to hand back to the caller; this implementation returns {@code arr}
+   */
+  public Object endArray(Object arr) {
+    return arr;
+  }
+
+  /**
+   * Reads elements until {@code ARRAY_END} is seen.
+   *
+   * @return the result of {@link #endArray(Object)} for the populated container
+   * @throws IOException if the underlying parser reports one
+   */
+  public Object getArray() throws IOException {
+    Object arr = newArray();
+    while (parser.nextEvent() != JSONParser.ARRAY_END) {
+      Object val = getVal();
+      addArrayVal(arr, val);
+    }
+    return endArray(arr);
+  }
+
+}
diff --git a/solr/solrj/src/test/org/noggit/TestJSONParser.java b/solr/solrj/src/test/org/noggit/TestJSONParser.java
new file mode 100644
index 00000000000..8446cf97a04
--- /dev/null
+++ b/solr/solrj/src/test/org/noggit/TestJSONParser.java
@@ -0,0 +1,690 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.noggit; + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.solr.SolrTestCaseJ4; +import org.junit.Test; + +public class TestJSONParser extends SolrTestCaseJ4 { + + // these are to aid in debugging if an unexpected error occurs + static int parserType; + static int bufferSize; + static String parserInput; + static JSONParser lastParser; + + static int flags = JSONParser.FLAGS_DEFAULT; // the default + + public static String lastParser() { + return "parserType=" + parserType + + (parserType==1 ? " bufferSize=" + bufferSize : "") + + " parserInput='" + parserInput + "'" + "flags : " + lastParser.flags; + } + + public static JSONParser getParser(String s) { + return getParser(s, random().nextInt(2), -1); + } + + public static JSONParser getParser(String s, int type, int bufSize) { + parserInput = s; + parserType = type; + + JSONParser parser=null; + switch (type) { + case 0: + // test directly using input buffer + parser = new JSONParser(s.toCharArray(),0,s.length()); + break; + case 1: + // test using Reader... 
+ // small input buffers can help find bugs on boundary conditions + + if (bufSize < 1) bufSize = random().nextInt(25) + 1; + bufferSize = bufSize;// record in case there is an error + parser = new JSONParser(new StringReader(s), new char[bufSize]); + break; + } + if (parser == null) return null; + + lastParser = parser; + + if (flags != JSONParser.FLAGS_DEFAULT) { + parser.setFlags(flags); + } + + return parser; + } + + /** for debugging purposes + public void testSpecific() throws Exception { + JSONParser parser = getParser("[0",1,1); + for (;;) { + int ev = parser.nextEvent(); + if (ev == JSONParser.EOF) { + break; + } else { + System.out.println("got " + JSONParser.getEventString(ev)); + } + } + } + **/ + + public static byte[] events = new byte[256]; + static { + events['{'] = JSONParser.OBJECT_START; + events['}'] = JSONParser.OBJECT_END; + events['['] = JSONParser.ARRAY_START; + events[']'] = JSONParser.ARRAY_END; + events['s'] = JSONParser.STRING; + events['b'] = JSONParser.BOOLEAN; + events['l'] = JSONParser.LONG; + events['n'] = JSONParser.NUMBER; + events['N'] = JSONParser.BIGNUMBER; + events['0'] = JSONParser.NULL; + events['e'] = JSONParser.EOF; + } + + // match parser states with the expected states + public static void parse(JSONParser p, String input, String expected) throws IOException { + expected += "e"; + for (int i=0; i>1) + 1; + for (int j=0; j L(Object... lst) { + return Arrays.asList(lst); + } + public static Object[] A(Object... lst) { + return lst; + } + public static Map O(Object... 
lst) { + LinkedHashMap map = new LinkedHashMap(); + for (int i=0; i val = new LinkedHashMap(); + val.put("a",1); + val.put("b",2); + writer.write(val); + } + } + + @Test + public void testWritable() throws Exception { + test("[{'a':1,'b':2}]", L(new Custom()), -1); + test("[10,{'a':1,'b':2},20]", L(10, new Custom(), 20), -1); + } + + @Test + public void testUnknown() throws Exception { + test("['a,\\\"b\\\",c']", L(new Unknown()), -1); + } + +} diff --git a/solr/solrj/src/test/org/noggit/TestObjectBuilder.java b/solr/solrj/src/test/org/noggit/TestObjectBuilder.java new file mode 100644 index 00000000000..e4a75049bfa --- /dev/null +++ b/solr/solrj/src/test/org/noggit/TestObjectBuilder.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.noggit; + +import java.io.IOException; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import org.apache.solr.SolrTestCaseJ4; +import org.junit.Test; + +public class TestObjectBuilder extends SolrTestCaseJ4 { + + public void _test(String val, Object expected) throws IOException { + val = val.replace('\'','"'); + Object v = ObjectBuilder.fromJSON(val); + + String s1 = JSONUtil.toJSON(v,-1); + String s2 = JSONUtil.toJSON(expected,-1); + assertEquals(s1, s2); + + // now make sure that it round-trips correctly + JSONParser p2 = TestJSONParser.getParser(s1); + Object v2 = ObjectBuilder.getVal(p2); + String s3 = JSONUtil.toJSON(v2,-1); + assertEquals(s1, s3); + } + + public static List L(Object... lst) { + return Arrays.asList(lst); + } + public static Object[] A(Object... lst) { + return lst; + } + public static Map O(Object... lst) { + LinkedHashMap map = new LinkedHashMap(); + for (int i=0; i