001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 /*
019 * $Id: WriterToUTF8Buffered.java 469356 2006-10-31 03:20:34Z minchau $
020 */
021 package org.apache.xml.serializer;
022
023 import java.io.IOException;
024 import java.io.OutputStream;
025 import java.io.UnsupportedEncodingException;
026 import java.io.Writer;
027
028
029 /**
030 * This class writes unicode characters to a byte stream (java.io.OutputStream)
031 * as quickly as possible. It buffers the output in an internal
032 * buffer which must be flushed to the OutputStream when done. This flushing
033 * is done via the close() flush() or flushBuffer() method.
034 *
035 * This class is only used internally within Xalan.
036 *
037 * @xsl.usage internal
038 */
039 final class WriterToUTF8Buffered extends Writer implements WriterChain
040 {
041
042 /** number of bytes that the byte buffer can hold.
043 * This is a fixed constant is used rather than m_outputBytes.lenght for performance.
044 */
045 private static final int BYTES_MAX=16*1024;
046 /** number of characters that the character buffer can hold.
047 * This is 1/3 of the number of bytes because UTF-8 encoding
048 * can expand one unicode character by up to 3 bytes.
049 */
050 private static final int CHARS_MAX=(BYTES_MAX/3);
051
052 // private static final int
053
054 /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */
055 private final OutputStream m_os;
056
057 /**
058 * The internal buffer where data is stored.
059 * (sc & sb remove final to compile in JDK 1.1.8)
060 */
061 private final byte m_outputBytes[];
062
063 private final char m_inputChars[];
064
065 /**
066 * The number of valid bytes in the buffer. This value is always
067 * in the range <tt>0</tt> through <tt>m_outputBytes.length</tt>; elements
068 * <tt>m_outputBytes[0]</tt> through <tt>m_outputBytes[count-1]</tt> contain valid
069 * byte data.
070 */
071 private int count;
072
073 /**
074 * Create an buffered UTF-8 writer.
075 *
076 *
077 * @param out the underlying output stream.
078 *
079 * @throws UnsupportedEncodingException
080 */
081 public WriterToUTF8Buffered(OutputStream out)
082 {
083 m_os = out;
084 // get 3 extra bytes to make buffer overflow checking simpler and faster
085 // we won't have to keep checking for a few extra characters
086 m_outputBytes = new byte[BYTES_MAX + 3];
087
088 // Big enough to hold the input chars that will be transformed
089 // into output bytes in m_ouputBytes.
090 m_inputChars = new char[CHARS_MAX + 2];
091 count = 0;
092
093 // the old body of this constructor, before the buffersize was changed to a constant
094 // this(out, 8*1024);
095 }
096
097 /**
098 * Create an buffered UTF-8 writer to write data to the
099 * specified underlying output stream with the specified buffer
100 * size.
101 *
102 * @param out the underlying output stream.
103 * @param size the buffer size.
104 * @exception IllegalArgumentException if size <= 0.
105 */
106 // public WriterToUTF8Buffered(final OutputStream out, final int size)
107 // {
108 //
109 // m_os = out;
110 //
111 // if (size <= 0)
112 // {
113 // throw new IllegalArgumentException(
114 // SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0");
115 // }
116 //
117 // m_outputBytes = new byte[size];
118 // count = 0;
119 // }
120
121 /**
122 * Write a single character. The character to be written is contained in
123 * the 16 low-order bits of the given integer value; the 16 high-order bits
124 * are ignored.
125 *
126 * <p> Subclasses that intend to support efficient single-character output
127 * should override this method.
128 *
129 * @param c int specifying a character to be written.
130 * @exception IOException If an I/O error occurs
131 */
132 public void write(final int c) throws IOException
133 {
134
135 /* If we are close to the end of the buffer then flush it.
136 * Remember the buffer can hold a few more bytes than BYTES_MAX
137 */
138 if (count >= BYTES_MAX)
139 flushBuffer();
140
141 if (c < 0x80)
142 {
143 m_outputBytes[count++] = (byte) (c);
144 }
145 else if (c < 0x800)
146 {
147 m_outputBytes[count++] = (byte) (0xc0 + (c >> 6));
148 m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
149 }
150 else if (c < 0x10000)
151 {
152 m_outputBytes[count++] = (byte) (0xe0 + (c >> 12));
153 m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
154 m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
155 }
156 else
157 {
158 m_outputBytes[count++] = (byte) (0xf0 + (c >> 18));
159 m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f));
160 m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
161 m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
162 }
163
164 }
165
166
167 /**
168 * Write a portion of an array of characters.
169 *
170 * @param chars Array of characters
171 * @param start Offset from which to start writing characters
172 * @param length Number of characters to write
173 *
174 * @exception IOException If an I/O error occurs
175 *
176 * @throws java.io.IOException
177 */
178 public void write(final char chars[], final int start, final int length)
179 throws java.io.IOException
180 {
181
182 // We multiply the length by three since this is the maximum length
183 // of the characters that we can put into the buffer. It is possible
184 // for each Unicode character to expand to three bytes.
185
186 int lengthx3 = 3*length;
187
188 if (lengthx3 >= BYTES_MAX - count)
189 {
190 // The requested length is greater than the unused part of the buffer
191 flushBuffer();
192
193 if (lengthx3 > BYTES_MAX)
194 {
195 /*
196 * The requested length exceeds the size of the buffer.
197 * Cut the buffer up into chunks, each of which will
198 * not cause an overflow to the output buffer m_outputBytes,
199 * and make multiple recursive calls.
200 * Be careful about integer overflows in multiplication.
201 */
202 int split = length/CHARS_MAX;
203 final int chunks;
204 if (length % CHARS_MAX > 0)
205 chunks = split + 1;
206 else
207 chunks = split;
208 int end_chunk = start;
209 for (int chunk = 1; chunk <= chunks; chunk++)
210 {
211 int start_chunk = end_chunk;
212 end_chunk = start + (int) ((((long) length) * chunk) / chunks);
213
214 // Adjust the end of the chunk if it ends on a high char
215 // of a Unicode surrogate pair and low char of the pair
216 // is not going to be in the same chunk
217 final char c = chars[end_chunk - 1];
218 int ic = chars[end_chunk - 1];
219 if (c >= 0xD800 && c <= 0xDBFF) {
220 // The last Java char that we were going
221 // to process is the first of a
222 // Java surrogate char pair that
223 // represent a Unicode character.
224
225 if (end_chunk < start + length) {
226 // Avoid spanning by including the low
227 // char in the current chunk of chars.
228 end_chunk++;
229 } else {
230 /* This is the last char of the last chunk,
231 * and it is the high char of a high/low pair with
232 * no low char provided.
233 * TODO: error message needed.
234 * The char array incorrectly ends in a high char
235 * of a high/low surrogate pair, but there is
236 * no corresponding low as the high is the last char
237 */
238 end_chunk--;
239 }
240 }
241
242
243 int len_chunk = (end_chunk - start_chunk);
244 this.write(chars,start_chunk, len_chunk);
245 }
246 return;
247 }
248 }
249
250
251
252 final int n = length+start;
253 final byte[] buf_loc = m_outputBytes; // local reference for faster access
254 int count_loc = count; // local integer for faster access
255 int i = start;
256 {
257 /* This block could be omitted and the code would produce
258 * the same result. But this block exists to give the JIT
259 * a better chance of optimizing a tight and common loop which
260 * occurs when writing out ASCII characters.
261 */
262 char c;
263 for(; i < n && (c = chars[i])< 0x80 ; i++ )
264 buf_loc[count_loc++] = (byte)c;
265 }
266 for (; i < n; i++)
267 {
268
269 final char c = chars[i];
270
271 if (c < 0x80)
272 buf_loc[count_loc++] = (byte) (c);
273 else if (c < 0x800)
274 {
275 buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
276 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
277 }
278 /**
279 * The following else if condition is added to support XML 1.1 Characters for
280 * UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
281 * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
282 * [1101 11yy] [yyxx xxxx] (low surrogate)
283 * * uuuuu = wwww + 1
284 */
285 else if (c >= 0xD800 && c <= 0xDBFF)
286 {
287 char high, low;
288 high = c;
289 i++;
290 low = chars[i];
291
292 buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
293 buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
294 buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
295 buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
296 }
297 else
298 {
299 buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
300 buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
301 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
302 }
303 }
304 // Store the local integer back into the instance variable
305 count = count_loc;
306
307 }
308
309 /**
310 * Write a string.
311 *
312 * @param s String to be written
313 *
314 * @exception IOException If an I/O error occurs
315 */
316 public void write(final String s) throws IOException
317 {
318
319 // We multiply the length by three since this is the maximum length
320 // of the characters that we can put into the buffer. It is possible
321 // for each Unicode character to expand to three bytes.
322 final int length = s.length();
323 int lengthx3 = 3*length;
324
325 if (lengthx3 >= BYTES_MAX - count)
326 {
327 // The requested length is greater than the unused part of the buffer
328 flushBuffer();
329
330 if (lengthx3 > BYTES_MAX)
331 {
332 /*
333 * The requested length exceeds the size of the buffer,
334 * so break it up in chunks that don't exceed the buffer size.
335 */
336 final int start = 0;
337 int split = length/CHARS_MAX;
338 final int chunks;
339 if (length % CHARS_MAX > 0)
340 chunks = split + 1;
341 else
342 chunks = split;
343 int end_chunk = 0;
344 for (int chunk = 1; chunk <= chunks; chunk++)
345 {
346 int start_chunk = end_chunk;
347 end_chunk = start + (int) ((((long) length) * chunk) / chunks);
348 s.getChars(start_chunk,end_chunk, m_inputChars,0);
349 int len_chunk = (end_chunk - start_chunk);
350
351 // Adjust the end of the chunk if it ends on a high char
352 // of a Unicode surrogate pair and low char of the pair
353 // is not going to be in the same chunk
354 final char c = m_inputChars[len_chunk - 1];
355 if (c >= 0xD800 && c <= 0xDBFF) {
356 // Exclude char in this chunk,
357 // to avoid spanning a Unicode character
358 // that is in two Java chars as a high/low surrogate
359 end_chunk--;
360 len_chunk--;
361 if (chunk == chunks) {
362 /* TODO: error message needed.
363 * The String incorrectly ends in a high char
364 * of a high/low surrogate pair, but there is
365 * no corresponding low as the high is the last char
366 * Recover by ignoring this last char.
367 */
368 }
369 }
370
371 this.write(m_inputChars,0, len_chunk);
372 }
373 return;
374 }
375 }
376
377
378 s.getChars(0, length , m_inputChars, 0);
379 final char[] chars = m_inputChars;
380 final int n = length;
381 final byte[] buf_loc = m_outputBytes; // local reference for faster access
382 int count_loc = count; // local integer for faster access
383 int i = 0;
384 {
385 /* This block could be omitted and the code would produce
386 * the same result. But this block exists to give the JIT
387 * a better chance of optimizing a tight and common loop which
388 * occurs when writing out ASCII characters.
389 */
390 char c;
391 for(; i < n && (c = chars[i])< 0x80 ; i++ )
392 buf_loc[count_loc++] = (byte)c;
393 }
394 for (; i < n; i++)
395 {
396
397 final char c = chars[i];
398
399 if (c < 0x80)
400 buf_loc[count_loc++] = (byte) (c);
401 else if (c < 0x800)
402 {
403 buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
404 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
405 }
406 /**
407 * The following else if condition is added to support XML 1.1 Characters for
408 * UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
409 * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
410 * [1101 11yy] [yyxx xxxx] (low surrogate)
411 * * uuuuu = wwww + 1
412 */
413 else if (c >= 0xD800 && c <= 0xDBFF)
414 {
415 char high, low;
416 high = c;
417 i++;
418 low = chars[i];
419
420 buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
421 buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
422 buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
423 buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
424 }
425 else
426 {
427 buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
428 buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
429 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
430 }
431 }
432 // Store the local integer back into the instance variable
433 count = count_loc;
434
435 }
436
437 /**
438 * Flush the internal buffer
439 *
440 * @throws IOException
441 */
442 public void flushBuffer() throws IOException
443 {
444
445 if (count > 0)
446 {
447 m_os.write(m_outputBytes, 0, count);
448
449 count = 0;
450 }
451 }
452
453 /**
454 * Flush the stream. If the stream has saved any characters from the
455 * various write() methods in a buffer, write them immediately to their
456 * intended destination. Then, if that destination is another character or
457 * byte stream, flush it. Thus one flush() invocation will flush all the
458 * buffers in a chain of Writers and OutputStreams.
459 *
460 * @exception IOException If an I/O error occurs
461 *
462 * @throws java.io.IOException
463 */
464 public void flush() throws java.io.IOException
465 {
466 flushBuffer();
467 m_os.flush();
468 }
469
470 /**
471 * Close the stream, flushing it first. Once a stream has been closed,
472 * further write() or flush() invocations will cause an IOException to be
473 * thrown. Closing a previously-closed stream, however, has no effect.
474 *
475 * @exception IOException If an I/O error occurs
476 *
477 * @throws java.io.IOException
478 */
479 public void close() throws java.io.IOException
480 {
481 flushBuffer();
482 m_os.close();
483 }
484
485 /**
486 * Get the output stream where the events will be serialized to.
487 *
488 * @return reference to the result stream, or null of only a writer was
489 * set.
490 */
491 public OutputStream getOutputStream()
492 {
493 return m_os;
494 }
495
496 public Writer getWriter()
497 {
498 // Only one of getWriter() or getOutputStream() can return null
499 // This type of writer wraps an OutputStream, not a Writer.
500 return null;
501 }
502 }