001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 /*
019 * $Id: EncodingInfo.java 468654 2006-10-28 07:09:23Z minchau $
020 */
021 package org.apache.xml.serializer;
022
023
024 /**
 * Holds information about a given encoding, namely the Java name for the
 * encoding and the equivalent ISO name.
027 * <p>
028 * An object of this type has two useful methods
029 * <pre>
030 * isInEncoding(char ch);
031 * </pre>
032 * which can be called if the character is not the high one in
033 * a surrogate pair and:
034 * <pre>
035 * isInEncoding(char high, char low);
036 * </pre>
 * which can be called if the two characters form a high/low surrogate pair.
038 * <p>
039 * An EncodingInfo object is a node in a binary search tree. Such a node
040 * will answer if a character is in the encoding, and do so for a given
041 * range of unicode values (<code>m_first</code> to
042 * <code>m_last</code>). It will handle a certain range of values
043 * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>).
 * If the Unicode code point is before that explicit range, that is, it
 * is in the range <code>m_first &lt;= value &lt; m_explFirst</code>, then it
 * will delegate to another node, <code>m_before</code>, which is the root
 * of the subtree for that range. Likewise for values in the range
 * <code>m_explLast &lt; value &lt;= m_last</code>, but delegating to
 * <code>m_after</code>.
048 * <p>
049 * Actually figuring out if a code point is in the encoding is expensive. So the
050 * purpose of this tree is to cache such determinations, and not to build the
051 * entire tree of information at the start, but only build up as much of the
052 * tree as is used during the transformation.
053 * <p>
054 * This Class is not a public API, and should only be used internally within
055 * the serializer.
056 * <p>
057 * This class is not a public API.
058 * @xsl.usage internal
059 */
060 public final class EncodingInfo extends Object
061 {
062
063 /**
 * Not all characters in an encoding are in one contiguous group,
 * however there is a lowest contiguous group starting at '\u0001'
 * and working up to m_highCharInContiguousGroup.
 * <p>
 * This is the char for which chars at or below this value are
 * definitely in the encoding, although chars
 * above this point might also be in the encoding.
 * This exists for performance, especially for ASCII characters
 * because for ASCII all chars in the range '\u0001' to '\u007F'
 * are in the encoding.
074 *
075 */
076 private final char m_highCharInContiguousGroup;
077
078 /**
079 * The ISO encoding name.
080 */
081 final String name;
082
083 /**
084 * The name used by the Java convertor.
085 */
086 final String javaName;
087
088 /**
089 * A helper object that we can ask if a
090 * single char, or a surrogate UTF-16 pair
091 * of chars that form a single character,
092 * is in this encoding.
093 */
094 private InEncoding m_encoding;
095
096 /**
097 * This is not a public API. It returns true if the
098 * char in question is in the encoding.
099 * @param ch the char in question.
100 * <p>
101 * This method is not a public API.
102 * @xsl.usage internal
103 */
104 public boolean isInEncoding(char ch) {
105 if (m_encoding == null) {
106 m_encoding = new EncodingImpl();
107
108 // One could put alternate logic in here to
109 // instantiate another object that implements the
110 // InEncoding interface. For example if the JRE is 1.4 or up
111 // we could have an object that uses JRE 1.4 methods
112 }
113 return m_encoding.isInEncoding(ch);
114 }
115
116 /**
117 * This is not a public API. It returns true if the
118 * character formed by the high/low pair is in the encoding.
119 * @param high a char that the a high char of a high/low surrogate pair.
120 * @param low a char that is the low char of a high/low surrogate pair.
121 * <p>
122 * This method is not a public API.
123 * @xsl.usage internal
124 */
125 public boolean isInEncoding(char high, char low) {
126 if (m_encoding == null) {
127 m_encoding = new EncodingImpl();
128
129 // One could put alternate logic in here to
130 // instantiate another object that implements the
131 // InEncoding interface. For example if the JRE is 1.4 or up
132 // we could have an object that uses JRE 1.4 methods
133 }
134 return m_encoding.isInEncoding(high, low);
135 }
136
/**
 * Creates an EncodingInfo object from the ISO name and the Java name of
 * an encoding.
 * <p>
 * If both names are null, any character will be considered to be in the
 * encoding. This is useful when the serializer is in temporary output
 * state and has no associated encoding.
 *
 * @param name reference to the ISO name.
 * @param javaName reference to the Java encoding name.
 * @param highChar the char for which characters at or below this value
 * are definitely in the encoding; characters above it might still be in
 * the encoding.
 */
public EncodingInfo(String name, String javaName, char highChar)
{
    this.m_highCharInContiguousGroup = highChar;
    this.javaName = javaName;
    this.name = name;
}
156
157
158
159 /**
160 * A simple interface to isolate the implementation.
161 * We could also use some new JRE 1.4 methods in another implementation
162 * provided we use reflection with them.
163 * <p>
164 * This interface is not a public API,
165 * and should only be used internally within the serializer.
166 * @xsl.usage internal
167 */
168 private interface InEncoding {
169 /**
170 * Returns true if the char is in the encoding
171 */
172 public boolean isInEncoding(char ch);
173 /**
174 * Returns true if the high/low surrogate pair forms
175 * a character that is in the encoding.
176 */
177 public boolean isInEncoding(char high, char low);
178 }
179
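/**
 * This class implements the InEncoding interface as a node of a lazily
 * built binary search tree that caches, per Unicode value, whether that
 * value is in the encoding.
 */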
183 private class EncodingImpl implements InEncoding {
184
185
186
187 public boolean isInEncoding(char ch1) {
188 final boolean ret;
189 int codePoint = Encodings.toCodePoint(ch1);
190 if (codePoint < m_explFirst) {
191 // The unicode value is before the range
192 // that we explictly manage, so we delegate the answer.
193
194 // If we don't have an m_before object to delegate to, make one.
195 if (m_before == null)
196 m_before =
197 new EncodingImpl(
198 m_encoding,
199 m_first,
200 m_explFirst - 1,
201 codePoint);
202 ret = m_before.isInEncoding(ch1);
203 } else if (m_explLast < codePoint) {
204 // The unicode value is after the range
205 // that we explictly manage, so we delegate the answer.
206
207 // If we don't have an m_after object to delegate to, make one.
208 if (m_after == null)
209 m_after =
210 new EncodingImpl(
211 m_encoding,
212 m_explLast + 1,
213 m_last,
214 codePoint);
215 ret = m_after.isInEncoding(ch1);
216 } else {
217 // The unicode value is in the range we explitly handle
218 final int idx = codePoint - m_explFirst;
219
220 // If we already know the answer, just return it.
221 if (m_alreadyKnown[idx])
222 ret = m_isInEncoding[idx];
223 else {
224 // We don't know the answer, so find out,
225 // which may be expensive, then cache the answer
226 ret = inEncoding(ch1, m_encoding);
227 m_alreadyKnown[idx] = true;
228 m_isInEncoding[idx] = ret;
229 }
230 }
231 return ret;
232 }
233
234 public boolean isInEncoding(char high, char low) {
235 final boolean ret;
236 int codePoint = Encodings.toCodePoint(high,low);
237 if (codePoint < m_explFirst) {
238 // The unicode value is before the range
239 // that we explictly manage, so we delegate the answer.
240
241 // If we don't have an m_before object to delegate to, make one.
242 if (m_before == null)
243 m_before =
244 new EncodingImpl(
245 m_encoding,
246 m_first,
247 m_explFirst - 1,
248 codePoint);
249 ret = m_before.isInEncoding(high,low);
250 } else if (m_explLast < codePoint) {
251 // The unicode value is after the range
252 // that we explictly manage, so we delegate the answer.
253
254 // If we don't have an m_after object to delegate to, make one.
255 if (m_after == null)
256 m_after =
257 new EncodingImpl(
258 m_encoding,
259 m_explLast + 1,
260 m_last,
261 codePoint);
262 ret = m_after.isInEncoding(high,low);
263 } else {
264 // The unicode value is in the range we explitly handle
265 final int idx = codePoint - m_explFirst;
266
267 // If we already know the answer, just return it.
268 if (m_alreadyKnown[idx])
269 ret = m_isInEncoding[idx];
270 else {
271 // We don't know the answer, so find out,
272 // which may be expensive, then cache the answer
273 ret = inEncoding(high, low, m_encoding);
274 m_alreadyKnown[idx] = true;
275 m_isInEncoding[idx] = ret;
276 }
277 }
278 return ret;
279 }
280
281 /**
282 * The encoding.
283 */
284 final private String m_encoding;
285 /**
286 * m_first through m_last is the range of unicode
287 * values that this object will return an answer on.
288 * It may delegate to a similar object with a different
289 * range
290 */
291 final private int m_first;
292
293 /**
294 * m_explFirst through m_explLast is the range of unicode
295 * value that this object handles explicitly and does not
296 * delegate to a similar object.
297 */
298 final private int m_explFirst;
299 final private int m_explLast;
300 final private int m_last;
301
302 /**
303 * The object, of the same type as this one,
304 * that handles unicode values in a range before
305 * the range explictly handled by this object, and
306 * to which this object may delegate.
307 */
308 private InEncoding m_before;
309 /**
310 * The object, of the same type as this one,
311 * that handles unicode values in a range after
312 * the range explictly handled by this object, and
313 * to which this object may delegate.
314 */
315 private InEncoding m_after;
316
/**
 * The number of Unicode values explicitly handled
 * by a single EncodingImpl node. This value is
 * tuneable, but is set to 128 because that covers the
 * entire low range of ASCII type chars within a single
 * node.
 */
324 private static final int RANGE = 128;
325
326 /**
327 * A flag to record if we already know the answer
328 * for the given unicode value.
329 */
330 final private boolean m_alreadyKnown[] = new boolean[RANGE];
331 /**
332 * A table holding the answer on whether the given unicode
333 * value is in the encoding.
334 */
335 final private boolean m_isInEncoding[] = new boolean[RANGE];
336
/**
 * Creates the root node for the enclosing object's Java encoding name.
 */
private EncodingImpl() {
    // The root answers for every Unicode value from 0 through
    // Integer.MAX_VALUE, and explicitly handles the range starting at 0.
    this(javaName, 0, Integer.MAX_VALUE, 0);
}
342
/**
 * Creates a node that answers for the Unicode values <code>first</code>
 * through <code>last</code>, and that explicitly caches answers for the
 * <code>RANGE</code> values starting at <code>codePoint</code>.
 *
 * @param encoding the Java name of the encoding used for lookups;
 * may be null.
 * @param first the lowest Unicode value this node answers for, either
 * explicitly or by delegation.
 * @param last the highest Unicode value this node answers for, either
 * explicitly or by delegation.
 * @param codePoint the first Unicode value of the explicitly handled range.
 */
private EncodingImpl(String encoding, int first, int last, int codePoint) {
    // Set the range of unicode values that this object manages
    // either explicitly or implicitly.
    m_first = first;
    m_last = last;

    // Set the range of unicode values that this object
    // explicitly manages
    m_explFirst = codePoint;
    m_explLast = codePoint + (RANGE - 1);

    m_encoding = encoding;

    if (javaName == null) {
        /* A little bit more than optimization.
         *
         * We will say that any character is in the encoding if
         * we don't have an encoding.
         * This is meaningful when the serializer is being used
         * in temporary output state, where we are not writing to
         * the final output tree. It is when writing to the
         * final output tree that we need to worry about the output
         * encoding.
         *
         * This test must not be nested inside a "javaName != null"
         * test, otherwise it can never be true and the tables are
         * never pre-filled for the no-encoding case.
         */
        for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
            m_alreadyKnown[idx] = true;
            m_isInEncoding[idx] = true;
        }
    } else if (0 <= m_explFirst && m_explFirst <= 127) {
        // Some optimization: this particular EncodingImpl explicitly
        // handles characters in the low range.
        if ("UTF8".equals(javaName)
            || "UTF-16".equals(javaName)
            || "ASCII".equals(javaName)
            || "US-ASCII".equals(javaName)
            || "Unicode".equals(javaName)
            || "UNICODE".equals(javaName)
            || javaName.startsWith("ISO8859")) {

            // Not only does this EncodingImpl object explicitly
            // handle characters in the low range, it is also one
            // that we know something about, without needing to call
            // inEncoding(char ch, String encoding) for this low range.
            //
            // By initializing the table ahead of time for these low
            // values, we prevent the expensive
            // inEncoding(char ch, String encoding) from being called,
            // at least for these common encodings.
            //
            // NOTE(review): the upper bound excludes 127 (0x7F), so that
            // value is still resolved lazily on first use.
            for (int unicode = 1; unicode < 127; unicode++) {
                final int idx = unicode - m_explFirst;
                if (0 <= idx && idx < RANGE) {
                    m_alreadyKnown[idx] = true;
                    m_isInEncoding[idx] = true;
                }
            }
        }
    }
}
409 }
410
411 /**
412 * This is heart of the code that determines if a given character
413 * is in the given encoding. This method is probably expensive,
414 * and the answer should be cached.
415 * <p>
416 * This method is not a public API,
417 * and should only be used internally within the serializer.
418 * @param ch the char in question, that is not a high char of
419 * a high/low surrogate pair.
420 * @param encoding the Java name of the enocding.
421 *
422 * @xsl.usage internal
423 *
424 */
425 private static boolean inEncoding(char ch, String encoding) {
426 boolean isInEncoding;
427 try {
428 char cArray[] = new char[1];
429 cArray[0] = ch;
430 // Construct a String from the char
431 String s = new String(cArray);
432 // Encode the String into a sequence of bytes
433 // using the given, named charset.
434 byte[] bArray = s.getBytes(encoding);
435 isInEncoding = inEncoding(ch, bArray);
436
437 } catch (Exception e) {
438 isInEncoding = false;
439
440 // If for some reason the encoding is null, e.g.
441 // for a temporary result tree, we should just
442 // say that every character is in the encoding.
443 if (encoding == null)
444 isInEncoding = true;
445 }
446 return isInEncoding;
447 }
448
449 /**
450 * This is heart of the code that determines if a given high/low
451 * surrogate pair forms a character that is in the given encoding.
452 * This method is probably expensive, and the answer should be cached.
453 * <p>
454 * This method is not a public API,
455 * and should only be used internally within the serializer.
456 * @param high the high char of
457 * a high/low surrogate pair.
458 * @param low the low char of a high/low surrogate pair.
459 * @param encoding the Java name of the encoding.
460 *
461 * @xsl.usage internal
462 *
463 */
464 private static boolean inEncoding(char high, char low, String encoding) {
465 boolean isInEncoding;
466 try {
467 char cArray[] = new char[2];
468 cArray[0] = high;
469 cArray[1] = low;
470 // Construct a String from the char
471 String s = new String(cArray);
472 // Encode the String into a sequence of bytes
473 // using the given, named charset.
474 byte[] bArray = s.getBytes(encoding);
475 isInEncoding = inEncoding(high,bArray);
476 } catch (Exception e) {
477 isInEncoding = false;
478 }
479
480 return isInEncoding;
481 }
482
483 /**
484 * This method is the core of determining if character
485 * is in the encoding. The method is not foolproof, because
486 * s.getBytes(encoding) has specified behavior only if the
487 * characters are in the specified encoding. However this
488 * method tries it's best.
489 * @param ch the char that was converted using getBytes, or
490 * the first char of a high/low pair that was converted.
491 * @param data the bytes written out by the call to s.getBytes(encoding);
492 * @return true if the character is in the encoding.
493 */
494 private static boolean inEncoding(char ch, byte[] data) {
495 final boolean isInEncoding;
496 // If the string written out as data is not in the encoding,
497 // the output is not specified according to the documentation
498 // on the String.getBytes(encoding) method,
499 // but we do our best here.
500 if (data==null || data.length == 0) {
501 isInEncoding = false;
502 }
503 else {
504 if (data[0] == 0)
505 isInEncoding = false;
506 else if (data[0] == '?' && ch != '?')
507 isInEncoding = false;
508 /*
509 * else if (isJapanese) {
510 * // isJapanese is really
511 * // ( "EUC-JP".equals(javaName)
512 * // || "EUC_JP".equals(javaName)
513 * // || "SJIS".equals(javaName) )
514 *
515 * // Work around some bugs in JRE for Japanese
516 * if(data[0] == 0x21)
517 * isInEncoding = false;
518 * else if (ch == 0xA5)
519 * isInEncoding = false;
520 * else
521 * isInEncoding = true;
522 * }
523 */
524
525 else {
526 // We don't know for sure, but it looks like it is in the encoding
527 isInEncoding = true;
528 }
529 }
530 return isInEncoding;
531 }
532
533 /**
534 * This method exists for performance reasons.
535 * <p>
536 * Except for '\u0000', if a char is less than or equal to the value
537 * returned by this method then it in the encoding.
538 * <p>
539 * The characters in an encoding are not contiguous, however
540 * there is a lowest group of chars starting at '\u0001' upto and
541 * including the char returned by this method that are all in the encoding.
542 * So the char returned by this method essentially defines the lowest
543 * contiguous group.
544 * <p>
545 * chars above the value returned might be in the encoding, but
546 * chars at or below the value returned are definately in the encoding.
547 * <p>
548 * In any case however, the isInEncoding(char) method can be used
549 * regardless of the value of the char returned by this method.
550 * <p>
551 * If the value returned is '\u0000' it means that every character must be tested
552 * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)}
553 * for surrogate pairs.
554 * <p>
555 * This method is not a public API.
556 * @xsl.usage internal
557 */
558 public final char getHighChar() {
559 return m_highCharInContiguousGroup;
560 }
561
562 }