Categories:
Audio (13)
Biotech (29)
Bytecode (36)
Database (77)
Framework (7)
Game (7)
General (507)
Graphics (53)
I/O (35)
IDE (2)
JAR Tools (101)
JavaBeans (21)
JDBC (121)
JDK (426)
JSP (20)
Logging (108)
Mail (58)
Messaging (8)
Network (84)
PDF (97)
Report (7)
Scripting (84)
Security (32)
Server (121)
Servlet (26)
SOAP (24)
Testing (54)
Web (15)
XML (309)
Collections:
Other Resources:
Woodstox 6.4.0 - Source Code Files
Woodstox 6.4.0 Source Code Files are provided at the Woodstox GitHub Website.
You can download them from the "src/main/java" folder.
You can also browse Woodstox Source Code files below:
✍: FYIcenter
⏎ com/ctc/wstx/io/UTF8Reader.java
/* Woodstox XML processor * * Copyright (c) 2004 Tatu Saloranta, tatu.saloranta@iki.fi * * Licensed under the License specified in file LICENSE, included with * the source code. * You may not use this file except in compliance with the License. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.ctc.wstx.io; import java.io.*; import com.ctc.wstx.api.ReaderConfig; import com.ctc.wstx.cfg.XmlConsts; /** * Optimized Reader that reads UTF-8 encoded content from an input stream. * In addition to doing (hopefully) optimal conversion, it can also take * array of "pre-read" (leftover) bytes; this is necessary when preliminary * stream/reader is trying to figure out XML encoding. */ public final class UTF8Reader extends BaseReader { boolean mXml11 = false; char mSurrogate = NULL_CHAR; /** * Total read character count; used for error reporting purposes */ int mCharCount = 0; /** * Total read byte count; used for error reporting purposes */ int mByteCount = 0; /* //////////////////////////////////////// // Life-cycle //////////////////////////////////////// */ public UTF8Reader(ReaderConfig cfg, InputStream in, byte[] buf, int ptr, int len, boolean recycleBuffer) { super(cfg, in, buf, ptr, len, recycleBuffer); } @Override public void setXmlCompliancy(int xmlVersion) { mXml11 = (xmlVersion == XmlConsts.XML_V_11); } /* //////////////////////////////////////// // Public API //////////////////////////////////////// */ @SuppressWarnings("cast") @Override public int read(char[] cbuf, int start, int len) throws IOException { // Let's first ensure there's enough room... if (start < 0 || (start+len) > cbuf.length) { reportBounds(cbuf, start, len); } // Already EOF? 
if (mByteBuffer == null) { return -1; } if (len < 1) { // dummy call? return 0; } len += start; int outPtr = start; // Ok, first; do we have a surrogate from last round? if (mSurrogate != NULL_CHAR) { cbuf[outPtr++] = mSurrogate; mSurrogate = NULL_CHAR; // No need to load more, already got one char // 05-Apr-2021, tatu: but if at the end must return: if (mBytePtr >= mByteBufferEnd) { mCharCount += 1; return 1; } } else { // To prevent unnecessary blocking (esp. with network streams), // we'll only require decoding of a single char int left = (mByteBufferEnd - mBytePtr); /* So; only need to load more if we can't provide at least * one more character. We need not do thorough check here, * but let's check the common cases here: either completely * empty buffer (left == 0), or one with less than max. byte * count for a single char, and starting of a multi-byte * encoding (this leaves possibility of a 2/3-byte char * that is still fully accessible... but that can be checked * by the load method) */ if (left < 4) { // Need to load more? if (left < 1 || mByteBuffer[mBytePtr] < 0) { if (!loadMore(left)) { // (legal) EOF? return -1; } } } } /* This may look silly, but using a local var is indeed faster * (if and when HotSpot properly gets things running) than * member variable... */ byte[] buf = mByteBuffer; int inPtr = mBytePtr; int inBufLen = mByteBufferEnd; main_loop: while (outPtr < len) { // At this point we have at least one byte available int c = (int) buf[inPtr++]; // Let's first do the quickie loop for common case; 7-bit ascii: if (c >= 0) { // ascii? can probably loop, then if (c == 0x7F && mXml11) { // DEL illegal in xml1.1 int bytePos = mByteCount + inPtr - 1; int charPos = mCharCount + (outPtr-start); reportInvalidXml11(c, bytePos, charPos); } cbuf[outPtr++] = (char) c; // ok since MSB is never on /* Ok, how many such chars could we safely process * without overruns? 
(will combine 2 in-loop comparisons * into just one) */ int outMax = (len - outPtr); // max output int inMax = (inBufLen - inPtr); // max input int inEnd = inPtr + ((inMax < outMax) ? inMax : outMax); ascii_loop: while (true) { if (inPtr >= inEnd) { break main_loop; } c = ((int) buf[inPtr++]) & 0xFF; if (c >= 0x7F) { // DEL, or multi-byte break ascii_loop; } cbuf[outPtr++] = (char) c; } if (c == 0x7F) { if (mXml11) { // DEL illegal in xml1.1 int bytePos = mByteCount + inPtr - 1; int charPos = mCharCount + (outPtr-start); reportInvalidXml11(c, bytePos, charPos); } // but not in xml 1.0 cbuf[outPtr++] = (char) c; if(inPtr >= inEnd){ break main_loop; } continue main_loop; } } int needed; // Ok; if we end here, we got multi-byte combination if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) c = (c & 0x1F); needed = 1; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) c = (c & 0x0F); needed = 2; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char BS, with surrogates and all... c = (c & 0x0F); needed = 3; } else { reportInvalidInitial(c & 0xFF, outPtr-start); // never gets here... needed = 1; } /* Do we have enough bytes? If not, let's just push back the * byte and leave, since we have already gotten at least one * char decoded. This way we will only block (with read from * input stream) when absolutely necessary. */ if ((inBufLen - inPtr) < needed) { --inPtr; break main_loop; } int d = (int) buf[inPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, outPtr-start); } c = (c << 6) | (d & 0x3F); if (needed > 1) { // needed == 1 means 2 bytes total d = buf[inPtr++]; // 3rd byte if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, outPtr-start); } c = (c << 6) | (d & 0x3F); if (needed > 2) { // 4 bytes? 
(need surrogates) d = buf[inPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, outPtr-start); } c = (c << 6) | (d & 0x3F); if (c > XmlConsts.MAX_UNICODE_CHAR) { reportInvalid(c, outPtr-start, "(above "+Integer.toHexString(XmlConsts.MAX_UNICODE_CHAR)+") "); } /* Ugh. Need to mess with surrogates. Ok; let's inline them * there, then, if there's room: if only room for one, * need to save the surrogate for the rainy day... */ c -= 0x10000; // to normalize it starting with 0x0 cbuf[outPtr++] = (char) (0xD800 + (c >> 10)); // hmmh. can this ever be 0? (not legal, at least?) c = (0xDC00 | (c & 0x03FF)); // Room for second part? if (outPtr >= len) { // nope mSurrogate = (char) c; break main_loop; } // sure, let's fall back to normal processing: } else { /* Otherwise, need to check that 3-byte chars are * legal ones (should not expand to surrogates; * 0xFFFE and 0xFFFF are illegal) */ if (c >= 0xD800) { // But first, let's check max chars: if (c < 0xE000) { reportInvalid(c, outPtr-start, "(a surrogate character) "); } else if (c >= 0xFFFE) { reportInvalid(c, outPtr-start, ""); } } else if (mXml11 && c == 0x2028) { // LSEP? /* 10-May-2006, TSa: Since LSEP is "non-associative", * it needs additional handling. One way to do * this is to convert preceding \r to \n. This * should be implemented better when integrating * decoder and tokenizer. */ if (outPtr > start && cbuf[outPtr-1] == '\r') { cbuf[outPtr-1] = '\n'; } c = CONVERT_LSEP_TO; } } } else { // (needed == 1) if (mXml11) { // high-order ctrl char detection... if (c <= 0x9F) { if (c == 0x85) { // NEL, let's convert? 
c = CONVERT_NEL_TO; } else if (c >= 0x7F) { // DEL, ctrl chars int bytePos = mByteCount + inPtr - 1; int charPos = mCharCount + (outPtr-start); reportInvalidXml11(c, bytePos, charPos); } } } } cbuf[outPtr++] = (char) c; if (inPtr >= inBufLen) { break main_loop; } } mBytePtr = inPtr; len = outPtr - start; mCharCount += len; return len; } /* //////////////////////////////////////// // Internal methods //////////////////////////////////////// */ private void reportInvalidInitial(int mask, int offset) throws IOException { // input (byte) ptr has been advanced by one, by now: int bytePos = mByteCount + mBytePtr - 1; int charPos = mCharCount + offset + 1; throw new CharConversionException("Invalid UTF-8 start byte 0x" +Integer.toHexString(mask) +" (at char #"+charPos+", byte #"+bytePos+")"); } private void reportInvalidOther(int mask, int offset) throws IOException { int bytePos = mByteCount + mBytePtr - 1; int charPos = mCharCount + offset; throw new CharConversionException("Invalid UTF-8 middle byte 0x" +Integer.toHexString(mask) +" (at char #"+charPos+", byte #"+bytePos+")"); } private void reportUnexpectedEOF(int gotBytes, int needed) throws IOException { int bytePos = mByteCount + gotBytes; int charPos = mCharCount; throw new CharConversionException("Unexpected EOF in the middle of a multi-byte char: got " +gotBytes+", needed "+needed +", at char #"+charPos+", byte #"+bytePos+")"); } private void reportInvalid(int value, int offset, String msg) throws IOException { int bytePos = mByteCount + mBytePtr - 1; int charPos = mCharCount + offset; throw new CharConversionException("Invalid UTF-8 character 0x" +Integer.toHexString(value)+msg +" at char #"+charPos+", byte #"+bytePos+")"); } /** * @param available Number of "unused" bytes in the input buffer * * @return True, if enough bytes were read to allow decoding of at least * one full character; false if EOF was encountered instead. 
*/ private boolean loadMore(int available) throws IOException { mByteCount += (mByteBufferEnd - available); // Bytes that need to be moved to the beginning of buffer? if (available > 0) { // 11-Nov-2008, TSa: can only move if we own the buffer; otherwise // we are stuck with the data. if (mBytePtr > 0 && canModifyBuffer()) { for (int i = 0; i < available; ++i) { mByteBuffer[i] = mByteBuffer[mBytePtr+i]; } mBytePtr = 0; mByteBufferEnd = available; } } else { // Ok; here we can actually reasonably expect an EOF, // so let's do a separate read right away: int count = readBytes(); if (count < 1) { if (count < 0) { // -1 freeBuffers(); // to help GC? return false; } // 0 count is no good; let's err out reportStrangeStream(); } } // We now have at least one byte... and that allows us to // calculate exactly how many bytes we need! @SuppressWarnings("cast") int c = (int) mByteBuffer[mBytePtr]; if (c >= 0) { // single byte (ascii) char... cool, can return return true; } // Ok, a multi-byte char, let's check how many bytes we'll need: int needed; if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) needed = 2; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) needed = 3; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char BS, with surrogates and all... needed = 4; } else { reportInvalidInitial(c & 0xFF, 0); // never gets here... but compiler whines without this: needed = 1; } /* And then we'll just need to load up to that many bytes; * if an EOF is hit, that'll be an error. But we need not do * actual decoding here, just load enough bytes. */ while ((mBytePtr + needed) > mByteBufferEnd) { int count = readBytesAt(mByteBufferEnd); if (count < 1) { if (count < 0) { // -1, EOF... no good! freeBuffers(); reportUnexpectedEOF(mByteBufferEnd, needed); } // 0 count is no good; let's err out reportStrangeStream(); } } return true; } }
⏎ com/ctc/wstx/io/UTF8Reader.java
Or download all of them as a single archive file:
File name: woodstox-core-6.4.0-fyi.zip File size: 552992 bytes Release date: 2022-10-25 Download
⇒ woodstox-core-6.4.0.jar - Woodstox Core 6.4.0
⇐ What Is Woodstox XML Processing
2023-01-29, 3368👍, 0💬
Popular Posts:
commons-io-2.6-sources.jar is the source JAR file for Apache Commons IO 2.6, which is a library of u...
What Is HttpComponents httpclient-4.2.2.jar? HttpComponents httpclient-4.2.2.jar is the JAR file for...
JDK 11 jdk.internal.vm.ci.jmod is the JMOD file for JDK 11 Internal VM CI module. JDK 11 Internal VM...
commons-collections4-4.4-sources.jar is the source JAR file for Apache Commons Collections 4.2, whic...
How to download and install mysql-connector-j-8.0.31.zip? Connector/J Java library is a JDBC Driver ...