001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements. See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership. The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the  "License");
007     * you may not use this file except in compliance with the License.
008     * You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    /*
019     * $Id: EncodingInfo.java 468654 2006-10-28 07:09:23Z minchau $
020     */
021    package org.apache.xml.serializer;
022    
023    
024    /**
025     * Holds information about a given encoding, which is the Java name for the
026     * encoding, the equivalent ISO name.
027     * <p>
028     * An object of this type has two useful methods
029     * <pre>
030     * isInEncoding(char ch);
031     * </pre>
032     * which can be called if the character is not the high one in
033     * a surrogate pair and:
034     * <pre>
035     * isInEncoding(char high, char low);
036     * </pre>
037     * which can be called if the two characters from a high/low surrogate pair.
038     * <p>
039     * An EncodingInfo object is a node in a binary search tree. Such a node
040     * will answer if a character is in the encoding, and do so for a given
041     * range of unicode values (<code>m_first</code> to
042     * <code>m_last</code>). It will handle a certain range of values
043     * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>).
044     * If the unicode point is before that explicit range, that is it
045     * is in the range <code>m_first <= value < m_explFirst</code>, then it will delegate to another EncodingInfo object for The root
046     * of such a tree, m_before.  Likewise for values in the range 
047     * <code>m_explLast < value <= m_last</code>, but delgating to <code>m_after</code>
048     * <p>
049     * Actually figuring out if a code point is in the encoding is expensive. So the
050     * purpose of this tree is to cache such determinations, and not to build the
051     * entire tree of information at the start, but only build up as much of the 
052     * tree as is used during the transformation.
053     * <p>
054     * This Class is not a public API, and should only be used internally within
055     * the serializer.
056     * <p>
057     * This class is not a public API.
058     * @xsl.usage internal
059     */
060    public final class EncodingInfo extends Object
061    {
062    
063        /**
064         * Not all characters in an encoding are in on contiguous group,
065         * however there is a lowest contiguous group starting at '\u0001'
066         * and working up to m_highCharInContiguousGroup.
067         * <p>
068         * This is the char for which chars at or below this value are 
069         * definately in the encoding, although for chars
070         * above this point they might be in the encoding.
071         * This exists for performance, especially for ASCII characters
072         * because for ASCII all chars in the range '\u0001' to '\u007F' 
073         * are in the encoding.
074         * 
075         */
076        private final char m_highCharInContiguousGroup;
077    
078        /**
079         * The ISO encoding name.
080         */
081        final String name;
082    
083        /**
084         * The name used by the Java convertor.
085         */
086        final String javaName;
087        
088        /**
089         * A helper object that we can ask if a
090         * single char, or a surrogate UTF-16 pair
091         * of chars that form a single character,
092         * is in this encoding.
093         */
094        private InEncoding m_encoding;
095        
096        /**
097         * This is not a public API. It returns true if the
098         * char in question is in the encoding.
099         * @param ch the char in question.
100         * <p>
101         * This method is not a public API.
102         * @xsl.usage internal
103         */
104        public boolean isInEncoding(char ch) {
105            if (m_encoding == null) {
106                m_encoding = new EncodingImpl();
107                
108                // One could put alternate logic in here to
109                // instantiate another object that implements the
110                // InEncoding interface. For example if the JRE is 1.4 or up
111                // we could have an object that uses JRE 1.4 methods
112            }
113            return m_encoding.isInEncoding(ch); 
114        }
115        
116        /**
117         * This is not a public API. It returns true if the
118         * character formed by the high/low pair is in the encoding.
119         * @param high a char that the a high char of a high/low surrogate pair.
120         * @param low a char that is the low char of a high/low surrogate pair.
121         * <p>
122         * This method is not a public API.
123         * @xsl.usage internal
124         */
125        public boolean isInEncoding(char high, char low) {
126            if (m_encoding == null) {
127                m_encoding = new EncodingImpl();
128                
129                // One could put alternate logic in here to
130                // instantiate another object that implements the
131                // InEncoding interface. For example if the JRE is 1.4 or up
132                // we could have an object that uses JRE 1.4 methods
133            }
134            return m_encoding.isInEncoding(high, low); 
135        }
136    
137        /**
138         * Create an EncodingInfo object based on the ISO name and Java name.
139         * If both parameters are null any character will be considered to
140         * be in the encoding. This is useful for when the serializer is in
141         * temporary output state, and has no assciated encoding.
142         *
143         * @param name reference to the ISO name.
144         * @param javaName reference to the Java encoding name.
145         * @param highChar The char for which characters at or below this value are 
146         * definately in the
147         * encoding, although for characters above this point they might be in the encoding.
148         */
149        public EncodingInfo(String name, String javaName, char highChar)
150        {
151    
152            this.name = name;
153            this.javaName = javaName;
154            this.m_highCharInContiguousGroup = highChar;
155        }
156        
157        
158        
159        /**
160         * A simple interface to isolate the implementation.
161         * We could also use some new JRE 1.4 methods in another implementation
162         * provided we use reflection with them.
163         * <p>
164         * This interface is not a public API,
165         * and should only be used internally within the serializer. 
166         * @xsl.usage internal
167         */
168        private interface InEncoding {
169            /**
170             * Returns true if the char is in the encoding
171             */
172            public boolean isInEncoding(char ch);
173            /**
174             * Returns true if the high/low surrogate pair forms
175             * a character that is in the encoding.
176             */
177            public boolean isInEncoding(char high, char low);
178        }
179    
180        /**
181         * This class implements the 
182         */
183        private class EncodingImpl implements InEncoding {
184            
185    
186    
187            public boolean isInEncoding(char ch1) {
188                final boolean ret;
189                int codePoint = Encodings.toCodePoint(ch1);
190                if (codePoint < m_explFirst) {
191                    // The unicode value is before the range
192                    // that we explictly manage, so we delegate the answer.
193                    
194                    // If we don't have an m_before object to delegate to, make one.
195                    if (m_before == null)
196                        m_before =
197                            new EncodingImpl(
198                                m_encoding,
199                                m_first,
200                                m_explFirst - 1,
201                                codePoint);
202                    ret = m_before.isInEncoding(ch1);
203                } else if (m_explLast < codePoint) {
204                    // The unicode value is after the range
205                    // that we explictly manage, so we delegate the answer.
206                    
207                    // If we don't have an m_after object to delegate to, make one.
208                    if (m_after == null)
209                        m_after =
210                            new EncodingImpl(
211                                m_encoding,
212                                m_explLast + 1,
213                                m_last,
214                                codePoint);
215                    ret = m_after.isInEncoding(ch1);
216                } else {
217                    // The unicode value is in the range we explitly handle
218                    final int idx = codePoint - m_explFirst;
219                    
220                    // If we already know the answer, just return it.
221                    if (m_alreadyKnown[idx])
222                        ret = m_isInEncoding[idx];
223                    else {
224                        // We don't know the answer, so find out,
225                        // which may be expensive, then cache the answer 
226                        ret = inEncoding(ch1, m_encoding);
227                        m_alreadyKnown[idx] = true;
228                        m_isInEncoding[idx] = ret;
229                    }
230                }
231                return ret;
232            }
233    
234            public boolean isInEncoding(char high, char low) {
235                final boolean ret;
236                int codePoint = Encodings.toCodePoint(high,low);
237                if (codePoint < m_explFirst) {
238                    // The unicode value is before the range
239                    // that we explictly manage, so we delegate the answer.
240                    
241                    // If we don't have an m_before object to delegate to, make one.
242                    if (m_before == null)
243                        m_before =
244                            new EncodingImpl(
245                                m_encoding,
246                                m_first,
247                                m_explFirst - 1,
248                                codePoint);
249                    ret = m_before.isInEncoding(high,low);
250                } else if (m_explLast < codePoint) {
251                    // The unicode value is after the range
252                    // that we explictly manage, so we delegate the answer.
253                    
254                    // If we don't have an m_after object to delegate to, make one.
255                    if (m_after == null)
256                        m_after =
257                            new EncodingImpl(
258                                m_encoding,
259                                m_explLast + 1,
260                                m_last,
261                                codePoint);
262                    ret = m_after.isInEncoding(high,low);
263                } else {
264                    // The unicode value is in the range we explitly handle
265                    final int idx = codePoint - m_explFirst;
266                    
267                    // If we already know the answer, just return it.
268                    if (m_alreadyKnown[idx])
269                        ret = m_isInEncoding[idx];
270                    else {
271                        // We don't know the answer, so find out,
272                        // which may be expensive, then cache the answer 
273                        ret = inEncoding(high, low, m_encoding);
274                        m_alreadyKnown[idx] = true;
275                        m_isInEncoding[idx] = ret;
276                    }
277                }
278                return ret;
279            }
280    
281            /**
282             * The encoding.
283             */
284            final private String m_encoding;
285            /**
286             * m_first through m_last is the range of unicode
287             * values that this object will return an answer on.
288             * It may delegate to a similar object with a different
289             * range
290             */
291            final private int m_first;
292            
293            /**
294             * m_explFirst through m_explLast is the range of unicode
295             * value that this object handles explicitly and does not
296             * delegate to a similar object.
297             */
298            final private int m_explFirst;
299            final private int m_explLast;
300            final private int m_last;
301    
302            /**
303             * The object, of the same type as this one,
304             * that handles unicode values in a range before
305             * the range explictly handled by this object, and
306             * to which this object may delegate.
307             */
308            private InEncoding m_before;
309            /**
310             * The object, of the same type as this one,
311             * that handles unicode values in a range after
312             * the range explictly handled by this object, and
313             * to which this object may delegate.
314             */
315            private InEncoding m_after;
316            
317            /**
318             * The number of unicode values explicitly handled
319             * by a single EncodingInfo object. This value is 
320             * tuneable, but is set to 128 because that covers the
321             * entire low range of ASCII type chars within a single
322             * object.
323             */
324            private static final int RANGE = 128;
325    
326            /**
327             * A flag to record if we already know the answer
328             * for the given unicode value.
329             */
330            final private boolean m_alreadyKnown[] = new boolean[RANGE];
331            /**
332             * A table holding the answer on whether the given unicode
333             * value is in the encoding.
334             */
335            final private boolean m_isInEncoding[] = new boolean[RANGE];
336            
337            private EncodingImpl() {
338                // This object will answer whether any unicode value
339                // is in the encoding, it handles values 0 through Integer.MAX_VALUE
340                this(javaName, 0, Integer.MAX_VALUE, (char) 0);
341            }
342    
343            private EncodingImpl(String encoding, int first, int last, int codePoint) {
344                // Set the range of unicode values that this object manages
345                // either explicitly or implicitly.
346                m_first = first;
347                m_last = last;  
348                          
349                // Set the range of unicode values that this object 
350                // explicitly manages
351                m_explFirst = codePoint;
352                m_explLast = codePoint + (RANGE-1);  
353                
354                m_encoding = encoding;
355                
356                if (javaName != null)
357                {
358                    // Some optimization.
359                    if (0 <= m_explFirst && m_explFirst <= 127) {
360                        // This particular EncodingImpl explicitly handles
361                        // characters in the low range.
362                        if ("UTF8".equals(javaName)
363                            || "UTF-16".equals(javaName)
364                            || "ASCII".equals(javaName)
365                            || "US-ASCII".equals(javaName)
366                            || "Unicode".equals(javaName)
367                            || "UNICODE".equals(javaName)
368                            || javaName.startsWith("ISO8859")) {
369                            
370                            // Not only does this EncodingImpl object explicitly
371                            // handle chracters in the low range, it is
372                            // also one that we know something about, without
373                            // needing to call inEncoding(char ch, String encoding)
374                            // for this low range
375                            //
376                            // By initializing the table ahead of time
377                            // for these low values, we prevent the expensive
378                            // inEncoding(char ch, String encoding)
379                            // from being called, at least for these common
380                            // encodings.
381                            for (int unicode = 1; unicode < 127; unicode++) {
382                                final int idx = unicode - m_explFirst;
383                                if (0 <= idx && idx < RANGE) {
384                                    m_alreadyKnown[idx] = true;
385                                    m_isInEncoding[idx] = true;
386                                }
387                            }
388                        }
389                    }
390    
391                    /* A little bit more than optimization.
392                     * 
393                     * We will say that any character is in the encoding if
394                     * we don't have an encoding.
395                     * This is meaningful when the serializer is being used
396                     * in temporary output state, where we are not writing to
397                     * the final output tree.  It is when writing to the
398                     * final output tree that we need to worry about the output
399                     * encoding
400                     */
401                    if (javaName == null) {
402                        for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
403                            m_alreadyKnown[idx] = true;
404                            m_isInEncoding[idx] = true;
405                        }
406                    }
407                }
408            }
409        }
410    
411        /**
412         * This is heart of the code that determines if a given character
413         * is in the given encoding. This method is probably expensive,
414         * and the answer should be cached.
415         * <p>
416         * This method is not a public API,
417         * and should only be used internally within the serializer.
418         * @param ch the char in question, that is not a high char of
419         * a high/low surrogate pair.
420         * @param encoding the Java name of the enocding.
421         * 
422         * @xsl.usage internal
423         * 
424         */
425        private static boolean inEncoding(char ch, String encoding) {
426            boolean isInEncoding;
427            try {
428                char cArray[] = new char[1];
429                cArray[0] = ch;
430                // Construct a String from the char 
431                String s = new String(cArray);
432                // Encode the String into a sequence of bytes 
433                // using the given, named charset. 
434                byte[] bArray = s.getBytes(encoding);
435                isInEncoding = inEncoding(ch, bArray);
436    
437            } catch (Exception e) {
438                isInEncoding = false;
439                
440                // If for some reason the encoding is null, e.g.
441                // for a temporary result tree, we should just
442                // say that every character is in the encoding.
443                if (encoding == null)
444                    isInEncoding = true;
445            }
446            return isInEncoding;
447        }
448        
449        /**
450         * This is heart of the code that determines if a given high/low
451         * surrogate pair forms a character that is in the given encoding.
452         * This method is probably expensive, and the answer should be cached. 
453         * <p>
454         * This method is not a public API,
455         * and should only be used internally within the serializer.
456         * @param high the high char of
457         * a high/low surrogate pair.
458         * @param low the low char of a high/low surrogate pair.
459         * @param encoding the Java name of the encoding.
460         * 
461         * @xsl.usage internal
462         * 
463         */ 
464        private static boolean inEncoding(char high, char low, String encoding) {
465            boolean isInEncoding;
466            try {
467                char cArray[] = new char[2];
468                cArray[0] = high;
469                cArray[1] = low;
470                // Construct a String from the char 
471                String s = new String(cArray);
472                // Encode the String into a sequence of bytes 
473                // using the given, named charset. 
474                byte[] bArray = s.getBytes(encoding);
475                isInEncoding = inEncoding(high,bArray);
476            } catch (Exception e) {
477                isInEncoding = false;
478            }
479            
480            return isInEncoding;
481        } 
482        
483        /**
484         * This method is the core of determining if character
485         * is in the encoding. The method is not foolproof, because
486         * s.getBytes(encoding) has specified behavior only if the
487         * characters are in the specified encoding. However this
488         * method tries it's best.
489         * @param ch the char that was converted using getBytes, or
490         * the first char of a high/low pair that was converted.
491         * @param data the bytes written out by the call to s.getBytes(encoding);
492         * @return true if the character is in the encoding.
493         */
494        private static boolean inEncoding(char ch, byte[] data) {
495            final boolean isInEncoding;
496            // If the string written out as data is not in the encoding,
497            // the output is not specified according to the documentation
498            // on the String.getBytes(encoding) method,
499            // but we do our best here.        
500            if (data==null || data.length == 0) {
501                isInEncoding = false;
502            }
503            else {
504                if (data[0] == 0)
505                    isInEncoding = false;
506                else if (data[0] == '?' && ch != '?')
507                    isInEncoding = false;
508                /*
509                 * else if (isJapanese) {
510                 *   // isJapanese is really 
511                 *   //   (    "EUC-JP".equals(javaName) 
512                 *   //    ||  "EUC_JP".equals(javaName)
513                 *  //     ||  "SJIS".equals(javaName)   )
514                 * 
515                 *   // Work around some bugs in JRE for Japanese
516                 *   if(data[0] == 0x21)
517                 *     isInEncoding = false;
518                 *   else if (ch == 0xA5)
519                 *     isInEncoding = false;
520                 *   else
521                 *     isInEncoding = true;
522                 * }
523                 */ 
524                    
525                else {
526                    // We don't know for sure, but it looks like it is in the encoding
527                    isInEncoding = true; 
528                }
529            }
530            return isInEncoding;
531        }
532        
533        /**
534         * This method exists for performance reasons.
535         * <p>
536         * Except for '\u0000', if a char is less than or equal to the value
537         * returned by this method then it in the encoding.
538         * <p>
539         * The characters in an encoding are not contiguous, however
540         * there is a lowest group of chars starting at '\u0001' upto and
541         * including the char returned by this method that are all in the encoding.
542         * So the char returned by this method essentially defines the lowest
543         * contiguous group.
544         * <p>
545         * chars above the value returned might be in the encoding, but 
546         * chars at or below the value returned are definately in the encoding.
547         * <p>
548         * In any case however, the isInEncoding(char) method can be used
549         * regardless of the value of the char returned by this method.
550         * <p>
551         * If the value returned is '\u0000' it means that every character must be tested
552         * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)} 
553         * for surrogate pairs.
554         * <p>
555         * This method is not a public API.
556         * @xsl.usage internal
557         */
558        public final char getHighChar() {
559            return m_highCharInContiguousGroup;
560        }
561    
562    }