001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements. See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership. The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the  "License");
007     * you may not use this file except in compliance with the License.
008     * You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    /*
019     * $Id: WriterToUTF8Buffered.java 469356 2006-10-31 03:20:34Z minchau $
020     */
021    package org.apache.xml.serializer;
022    
023    import java.io.IOException;
024    import java.io.OutputStream;
025    import java.io.UnsupportedEncodingException;
026    import java.io.Writer;
027    
028    
029    /**
030     * This class writes unicode characters to a byte stream (java.io.OutputStream)
031     * as quickly as possible. It buffers the output in an internal
032     * buffer which must be flushed to the OutputStream when done. This flushing
033     * is done via the close() flush() or flushBuffer() method. 
034     * 
035     * This class is only used internally within Xalan.
036     * 
037     * @xsl.usage internal
038     */
039    final class WriterToUTF8Buffered extends Writer implements WriterChain
040    {
041        
042      /** number of bytes that the byte buffer can hold.
043       * This is a fixed constant is used rather than m_outputBytes.lenght for performance.
044       */
045      private static final int BYTES_MAX=16*1024;
046      /** number of characters that the character buffer can hold.
047       * This is 1/3 of the number of bytes because UTF-8 encoding
048       * can expand one unicode character by up to 3 bytes.
049       */
050      private static final int CHARS_MAX=(BYTES_MAX/3);
051      
052     // private static final int 
053      
054      /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */
055      private final OutputStream m_os;
056    
057      /**
058       * The internal buffer where data is stored.
059       * (sc & sb remove final to compile in JDK 1.1.8)
060       */
061      private final byte m_outputBytes[];
062      
063      private final char m_inputChars[];
064    
065      /**
066       * The number of valid bytes in the buffer. This value is always
067       * in the range <tt>0</tt> through <tt>m_outputBytes.length</tt>; elements
068       * <tt>m_outputBytes[0]</tt> through <tt>m_outputBytes[count-1]</tt> contain valid
069       * byte data.
070       */
071      private int count;
072    
073      /**
074       * Create an buffered UTF-8 writer.
075       *
076       *
077       * @param   out    the underlying output stream.
078       *
079       * @throws UnsupportedEncodingException
080       */
081      public WriterToUTF8Buffered(OutputStream out)
082      {
083          m_os = out;
084          // get 3 extra bytes to make buffer overflow checking simpler and faster
085          // we won't have to keep checking for a few extra characters
086          m_outputBytes = new byte[BYTES_MAX + 3];
087          
088          // Big enough to hold the input chars that will be transformed
089          // into output bytes in m_ouputBytes.
090          m_inputChars = new char[CHARS_MAX + 2];
091          count = 0;
092          
093    //      the old body of this constructor, before the buffersize was changed to a constant      
094    //      this(out, 8*1024);
095      }
096    
097      /**
098       * Create an buffered UTF-8 writer to write data to the
099       * specified underlying output stream with the specified buffer
100       * size.
101       *
102       * @param   out    the underlying output stream.
103       * @param   size   the buffer size.
104       * @exception IllegalArgumentException if size <= 0.
105       */
106    //  public WriterToUTF8Buffered(final OutputStream out, final int size)
107    //  {
108    //
109    //    m_os = out;
110    //
111    //    if (size <= 0)
112    //    {
113    //      throw new IllegalArgumentException(
114    //        SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0");
115    //    }
116    //
117    //    m_outputBytes = new byte[size];
118    //    count = 0;
119    //  }
120    
121      /**
122       * Write a single character.  The character to be written is contained in
123       * the 16 low-order bits of the given integer value; the 16 high-order bits
124       * are ignored.
125       *
126       * <p> Subclasses that intend to support efficient single-character output
127       * should override this method.
128       *
129       * @param c  int specifying a character to be written.
130       * @exception  IOException  If an I/O error occurs
131       */
132      public void write(final int c) throws IOException
133      {
134        
135        /* If we are close to the end of the buffer then flush it.
136         * Remember the buffer can hold a few more bytes than BYTES_MAX
137         */ 
138        if (count >= BYTES_MAX)
139            flushBuffer();
140    
141        if (c < 0x80)
142        {
143           m_outputBytes[count++] = (byte) (c);
144        }
145        else if (c < 0x800)
146        {
147          m_outputBytes[count++] = (byte) (0xc0 + (c >> 6));
148          m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
149        }
150        else if (c < 0x10000)
151        {
152          m_outputBytes[count++] = (byte) (0xe0 + (c >> 12));
153          m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
154          m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
155        }
156            else
157            {
158              m_outputBytes[count++] = (byte) (0xf0 + (c >> 18));
159              m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f));
160              m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
161              m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
162            }
163    
164      }
165    
166    
167      /**
168       * Write a portion of an array of characters.
169       *
170       * @param  chars  Array of characters
171       * @param  start   Offset from which to start writing characters
172       * @param  length   Number of characters to write
173       *
174       * @exception  IOException  If an I/O error occurs
175       *
176       * @throws java.io.IOException
177       */
178      public void write(final char chars[], final int start, final int length)
179              throws java.io.IOException
180      {
181    
182        // We multiply the length by three since this is the maximum length
183        // of the characters that we can put into the buffer.  It is possible
184        // for each Unicode character to expand to three bytes.
185    
186        int lengthx3 = 3*length;
187    
188        if (lengthx3 >= BYTES_MAX - count)
189        {
190          // The requested length is greater than the unused part of the buffer
191          flushBuffer();
192    
193          if (lengthx3 > BYTES_MAX)
194          {
195            /*
196             * The requested length exceeds the size of the buffer.
197             * Cut the buffer up into chunks, each of which will
198             * not cause an overflow to the output buffer m_outputBytes,
199             * and make multiple recursive calls.
200             * Be careful about integer overflows in multiplication.
201             */
202            int split = length/CHARS_MAX; 
203            final int chunks;
204            if (length % CHARS_MAX > 0) 
205                chunks = split + 1;
206            else
207                chunks = split;
208            int end_chunk = start;
209            for (int chunk = 1; chunk <= chunks; chunk++)
210            {
211                int start_chunk = end_chunk;
212                end_chunk = start + (int) ((((long) length) * chunk) / chunks);
213                
214                // Adjust the end of the chunk if it ends on a high char 
215                // of a Unicode surrogate pair and low char of the pair
216                // is not going to be in the same chunk
217                final char c = chars[end_chunk - 1]; 
218                int ic = chars[end_chunk - 1];
219                if (c >= 0xD800 && c <= 0xDBFF) {
220                    // The last Java char that we were going
221                    // to process is the first of a
222                    // Java surrogate char pair that
223                    // represent a Unicode character.
224    
225                    if (end_chunk < start + length) {
226                        // Avoid spanning by including the low
227                        // char in the current chunk of chars.
228                        end_chunk++;
229                    } else {
230                        /* This is the last char of the last chunk,
231                         * and it is the high char of a high/low pair with
232                         * no low char provided.
233                         * TODO: error message needed.
234                         * The char array incorrectly ends in a high char
235                         * of a high/low surrogate pair, but there is
236                         * no corresponding low as the high is the last char 
237                         */
238                        end_chunk--;
239                    }
240                }
241    
242    
243                int len_chunk = (end_chunk - start_chunk);
244                this.write(chars,start_chunk, len_chunk);
245            }
246            return;
247          }
248        }
249    
250    
251    
252        final int n = length+start;
253        final byte[] buf_loc = m_outputBytes; // local reference for faster access
254        int count_loc = count;      // local integer for faster access
255        int i = start;
256        {
257            /* This block could be omitted and the code would produce
258             * the same result. But this block exists to give the JIT
259             * a better chance of optimizing a tight and common loop which
260             * occurs when writing out ASCII characters. 
261             */ 
262            char c;
263            for(; i < n && (c = chars[i])< 0x80 ; i++ )
264                buf_loc[count_loc++] = (byte)c;
265        }
266        for (; i < n; i++)
267        {
268    
269          final char c = chars[i];
270    
271          if (c < 0x80)
272            buf_loc[count_loc++] = (byte) (c);
273          else if (c < 0x800)
274          {
275            buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
276            buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
277          }
278          /**
279            * The following else if condition is added to support XML 1.1 Characters for 
280            * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
281            * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
282            *          [1101 11yy] [yyxx xxxx] (low surrogate)
283            *          * uuuuu = wwww + 1
284            */
285          else if (c >= 0xD800 && c <= 0xDBFF) 
286          {
287              char high, low;
288              high = c;
289              i++;
290              low = chars[i];
291    
292              buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
293              buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
294              buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
295              buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
296          }
297          else
298          {
299            buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
300            buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
301            buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
302          }
303        }
304        // Store the local integer back into the instance variable
305        count = count_loc;
306    
307      }
308    
309      /**
310       * Write a string.
311       *
312       * @param  s  String to be written
313       *
314       * @exception  IOException  If an I/O error occurs
315       */
316      public void write(final String s) throws IOException
317      {
318    
319        // We multiply the length by three since this is the maximum length
320        // of the characters that we can put into the buffer.  It is possible
321        // for each Unicode character to expand to three bytes.
322        final int length = s.length();
323        int lengthx3 = 3*length;
324    
325        if (lengthx3 >= BYTES_MAX - count)
326        {
327          // The requested length is greater than the unused part of the buffer
328          flushBuffer();
329    
330          if (lengthx3 > BYTES_MAX)
331          {
332            /*
333             * The requested length exceeds the size of the buffer,
334             * so break it up in chunks that don't exceed the buffer size.
335             */
336             final int start = 0;
337             int split = length/CHARS_MAX; 
338             final int chunks;
339             if (length % CHARS_MAX > 0) 
340                 chunks = split + 1;
341             else
342                 chunks = split;
343             int end_chunk = 0;
344             for (int chunk = 1; chunk <= chunks; chunk++)
345             {
346                 int start_chunk = end_chunk;
347                 end_chunk = start + (int) ((((long) length) * chunk) / chunks);
348                 s.getChars(start_chunk,end_chunk, m_inputChars,0);
349                 int len_chunk = (end_chunk - start_chunk);
350    
351                 // Adjust the end of the chunk if it ends on a high char 
352                 // of a Unicode surrogate pair and low char of the pair
353                 // is not going to be in the same chunk
354                 final char c = m_inputChars[len_chunk - 1];
355                 if (c >= 0xD800 && c <= 0xDBFF) {
356                     // Exclude char in this chunk, 
357                     // to avoid spanning a Unicode character 
358                     // that is in two Java chars as a high/low surrogate
359                     end_chunk--;
360                     len_chunk--;
361                     if (chunk == chunks) {
362                         /* TODO: error message needed.
363                          * The String incorrectly ends in a high char
364                          * of a high/low surrogate pair, but there is
365                          * no corresponding low as the high is the last char
366                          * Recover by ignoring this last char.
367                          */
368                     }
369                 }
370    
371                 this.write(m_inputChars,0, len_chunk);
372             }
373             return;
374          }
375        }
376    
377    
378        s.getChars(0, length , m_inputChars, 0);
379        final char[] chars = m_inputChars;
380        final int n = length;
381        final byte[] buf_loc = m_outputBytes; // local reference for faster access
382        int count_loc = count;      // local integer for faster access
383        int i = 0;
384        {
385            /* This block could be omitted and the code would produce
386             * the same result. But this block exists to give the JIT
387             * a better chance of optimizing a tight and common loop which
388             * occurs when writing out ASCII characters. 
389             */ 
390            char c;
391            for(; i < n && (c = chars[i])< 0x80 ; i++ )
392                buf_loc[count_loc++] = (byte)c;
393        }
394        for (; i < n; i++)
395        {
396    
397          final char c = chars[i];
398    
399          if (c < 0x80)
400            buf_loc[count_loc++] = (byte) (c);
401          else if (c < 0x800)
402          {
403            buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
404            buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
405          }
406        /**
407          * The following else if condition is added to support XML 1.1 Characters for 
408          * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
409          * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
410          *          [1101 11yy] [yyxx xxxx] (low surrogate)
411          *          * uuuuu = wwww + 1
412          */
413        else if (c >= 0xD800 && c <= 0xDBFF) 
414        {
415            char high, low;
416            high = c;
417            i++;
418            low = chars[i];
419    
420            buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
421            buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
422            buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
423            buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
424        }
425          else
426          {
427            buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
428            buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
429            buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
430          }
431        }
432        // Store the local integer back into the instance variable
433        count = count_loc;
434    
435      }
436    
437      /**
438       * Flush the internal buffer
439       *
440       * @throws IOException
441       */
442      public void flushBuffer() throws IOException
443      {
444    
445        if (count > 0)
446        {
447          m_os.write(m_outputBytes, 0, count);
448    
449          count = 0;
450        }
451      }
452    
453      /**
454       * Flush the stream.  If the stream has saved any characters from the
455       * various write() methods in a buffer, write them immediately to their
456       * intended destination.  Then, if that destination is another character or
457       * byte stream, flush it.  Thus one flush() invocation will flush all the
458       * buffers in a chain of Writers and OutputStreams.
459       *
460       * @exception  IOException  If an I/O error occurs
461       *
462       * @throws java.io.IOException
463       */
464      public void flush() throws java.io.IOException
465      {
466        flushBuffer();
467        m_os.flush();
468      }
469    
470      /**
471       * Close the stream, flushing it first.  Once a stream has been closed,
472       * further write() or flush() invocations will cause an IOException to be
473       * thrown.  Closing a previously-closed stream, however, has no effect.
474       *
475       * @exception  IOException  If an I/O error occurs
476       *
477       * @throws java.io.IOException
478       */
479      public void close() throws java.io.IOException
480      {
481        flushBuffer();
482        m_os.close();
483      }
484    
485      /**
486       * Get the output stream where the events will be serialized to.
487       *
488       * @return reference to the result stream, or null of only a writer was
489       * set.
490       */
491      public OutputStream getOutputStream()
492      {
493        return m_os;
494      }
495    
496      public Writer getWriter()
497      {
498        // Only one of getWriter() or getOutputStream() can return null
499        // This type of writer wraps an OutputStream, not a Writer.
500        return null;
501      }
502    }