001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 /*
019 * $Id: Encodings.java 1225414 2011-12-29 02:38:30Z mrglavas $
020 */
021 package org.apache.xml.serializer;
022
023 import java.io.InputStream;
024 import java.io.OutputStream;
025 import java.io.OutputStreamWriter;
026 import java.io.UnsupportedEncodingException;
027 import java.io.Writer;
028 import java.util.ArrayList;
029 import java.util.Enumeration;
030 import java.util.Hashtable;
031 import java.util.List;
032 import java.util.Properties;
033 import java.util.StringTokenizer;
034
035
036 /**
037 * Provides information about encodings. Depends on the Java runtime
038 * to provide writers for the different encodings.
039 * <p>
040 * This class is not a public API. It is only public because it
041 * is used outside of this package.
042 *
043 * @xsl.usage internal
044 */
045
046 public final class Encodings extends Object
047 {
048 /**
 * Resource name of the properties file with the encodings data, built by
 * appending "/Encodings.properties" to SerializerBase.PKG_PATH. It is the
 * resource that loadEncodingInfo() asks the class loader for.
050 */
051 private static final String ENCODINGS_FILE = SerializerBase.PKG_PATH+"/Encodings.properties";
052
053 /**
054 * Returns a writer for the specified encoding based on
055 * an output stream.
056 * <p>
057 * This is not a public API.
058 * @param output The output stream
059 * @param encoding The encoding MIME name, not a Java name for the encoding.
060 * @return A suitable writer
061 * @throws UnsupportedEncodingException There is no convertor
062 * to support this encoding
063 * @xsl.usage internal
064 */
065 static Writer getWriter(OutputStream output, String encoding)
066 throws UnsupportedEncodingException
067 {
068
069 for (int i = 0; i < _encodings.length; ++i)
070 {
071 if (_encodings[i].name.equalsIgnoreCase(encoding))
072 {
073 try
074 {
075 String javaName = _encodings[i].javaName;
076 OutputStreamWriter osw = new OutputStreamWriter(output,javaName);
077 return osw;
078 }
079 catch (java.lang.IllegalArgumentException iae) // java 1.1.8
080 {
081 // keep trying
082 }
083 catch (UnsupportedEncodingException usee)
084 {
085
086 // keep trying
087 }
088 }
089 }
090
091 try
092 {
093 return new OutputStreamWriter(output, encoding);
094 }
095 catch (java.lang.IllegalArgumentException iae) // java 1.1.8
096 {
097 throw new UnsupportedEncodingException(encoding);
098 }
099 }
100
101 /**
102 * Returns the EncodingInfo object for the specified
103 * encoding, never null, although the encoding name
104 * inside the returned EncodingInfo object will be if
105 * we can't find a "real" EncodingInfo for the encoding.
106 * <p>
107 * This is not a public API.
108 *
109 * @param encoding The encoding
110 * @return The object that is used to determine if
111 * characters are in the given encoding.
112 * @xsl.usage internal
113 */
114 static EncodingInfo getEncodingInfo(String encoding)
115 {
116 EncodingInfo ei;
117
118 String normalizedEncoding = toUpperCaseFast(encoding);
119 ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding);
120 if (ei == null)
121 ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
122 if (ei == null) {
123 // We shouldn't have to do this, but just in case.
124 ei = new EncodingInfo(null,null, '\u0000');
125 }
126
127 return ei;
128 }
129
130 /**
131 * Determines if the encoding specified was recognized by the
132 * serializer or not.
133 *
134 * @param encoding The encoding
135 * @return boolean - true if the encoding was recognized else false
136 */
137 public static boolean isRecognizedEncoding(String encoding)
138 {
139 EncodingInfo ei;
140
141 String normalizedEncoding = encoding.toUpperCase();
142 ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding);
143 if (ei == null)
144 ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
145 if (ei != null)
146 return true;
147 return false;
148 }
149
150 /**
151 * A fast and cheap way to uppercase a String that is
152 * only made of printable ASCII characters.
153 * <p>
154 * This is not a public API.
155 * @param s a String of ASCII characters
156 * @return an uppercased version of the input String,
157 * possibly the same String.
158 * @xsl.usage internal
159 */
160 static private String toUpperCaseFast(final String s) {
161
162 boolean different = false;
163 final int mx = s.length();
164 char[] chars = new char[mx];
165 for (int i=0; i < mx; i++) {
166 char ch = s.charAt(i);
167 // is the character a lower case ASCII one?
168 if ('a' <= ch && ch <= 'z') {
169 // a cheap and fast way to uppercase that is good enough
170 ch = (char) (ch + ('A' - 'a'));
171 different = true; // the uppercased String is different
172 }
173 chars[i] = ch;
174 }
175
176 // A little optimization, don't call String.valueOf() if
177 // the uppercased string is the same as the input string.
178 final String upper;
179 if (different)
180 upper = String.valueOf(chars);
181 else
182 upper = s;
183
184 return upper;
185 }
186
187 /** The default encoding, as an ISO-style (MIME) name. */
188 static final String DEFAULT_MIME_ENCODING = "UTF-8";
189
190 /**
191 * Get the proper mime encoding. From the XSLT recommendation: "The encoding
192 * attribute specifies the preferred encoding to use for outputting the result
193 * tree. XSLT processors are required to respect values of UTF-8 and UTF-16.
194 * For other values, if the XSLT processor does not support the specified
195 * encoding it may signal an error; if it does not signal an error it should
196 * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding
197 * whose name does not match the EncName production of the XML Recommendation
198 * [XML]. If no encoding attribute is specified, then the XSLT processor should
199 * use either UTF-8 or UTF-16."
200 * <p>
201 * This is not a public API.
202 *
203 * @param encoding Reference to java-style encoding string, which may be null,
204 * in which case a default will be found.
205 *
206 * @return The ISO-style encoding string, or null if failure.
207 * @xsl.usage internal
208 */
209 static String getMimeEncoding(String encoding)
210 {
211
212 if (null == encoding)
213 {
214 try
215 {
216
217 // Get the default system character encoding. This may be
218 // incorrect if they passed in a writer, but right now there
219 // seems to be no way to get the encoding from a writer.
220 encoding = System.getProperty("file.encoding", "UTF8");
221
222 if (null != encoding)
223 {
224
225 /*
226 * See if the mime type is equal to UTF8. If you don't
227 * do that, then convertJava2MimeEncoding will convert
228 * 8859_1 to "ISO-8859-1", which is not what we want,
229 * I think, and I don't think I want to alter the tables
230 * to convert everything to UTF-8.
231 */
232 String jencoding =
233 (encoding.equalsIgnoreCase("Cp1252")
234 || encoding.equalsIgnoreCase("ISO8859_1")
235 || encoding.equalsIgnoreCase("8859_1")
236 || encoding.equalsIgnoreCase("UTF8"))
237 ? DEFAULT_MIME_ENCODING
238 : convertJava2MimeEncoding(encoding);
239
240 encoding =
241 (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING;
242 }
243 else
244 {
245 encoding = DEFAULT_MIME_ENCODING;
246 }
247 }
248 catch (SecurityException se)
249 {
250 encoding = DEFAULT_MIME_ENCODING;
251 }
252 }
253 else
254 {
255 encoding = convertJava2MimeEncoding(encoding);
256 }
257
258 return encoding;
259 }
260
261 /**
262 * Try the best we can to convert a Java encoding to a XML-style encoding.
263 * <p>
264 * This is not a public API.
265 * @param encoding non-null reference to encoding string, java style.
266 *
267 * @return ISO-style encoding string.
268 * @xsl.usage internal
269 */
270 private static String convertJava2MimeEncoding(String encoding)
271 {
272 EncodingInfo enc =
273 (EncodingInfo) _encodingTableKeyJava.get(toUpperCaseFast(encoding));
274 if (null != enc)
275 return enc.name;
276 return encoding;
277 }
278
279 /**
280 * Try the best we can to convert a Java encoding to a XML-style encoding.
281 * <p>
282 * This is not a public API.
283 *
284 * @param encoding non-null reference to encoding string, java style.
285 *
286 * @return ISO-style encoding string.
287 * <p>
288 * This method is not a public API.
289 * @xsl.usage internal
290 */
291 public static String convertMime2JavaEncoding(String encoding)
292 {
293
294 for (int i = 0; i < _encodings.length; ++i)
295 {
296 if (_encodings[i].name.equalsIgnoreCase(encoding))
297 {
298 return _encodings[i].javaName;
299 }
300 }
301
302 return encoding;
303 }
304
305 /**
306 * Load a list of all the supported encodings.
307 *
308 * System property "encodings" formatted using URL syntax may define an
309 * external encodings list. Thanks to Sergey Ushakov for the code
310 * contribution!
311 * @xsl.usage internal
312 */
313 private static EncodingInfo[] loadEncodingInfo()
314 {
315 try
316 {
317 final InputStream is;
318 is = SecuritySupport.getResourceAsStream(ObjectFactory.findClassLoader(),
319 ENCODINGS_FILE);
320
321 Properties props = new Properties();
322 if (is != null) {
323 props.load(is);
324 is.close();
325 } else {
326 // Seems to be no real need to force failure here, let the
327 // system do its best... The issue is not really very critical,
328 // and the output will be in any case _correct_ though maybe not
329 // always human-friendly... :)
330 // But maybe report/log the resource problem?
331 // Any standard ways to report/log errors (in static context)?
332 }
333
334 int totalEntries = props.size();
335
336 List encodingInfo_list = new ArrayList();
337 Enumeration keys = props.keys();
338 for (int i = 0; i < totalEntries; ++i)
339 {
340 String javaName = (String) keys.nextElement();
341 String val = props.getProperty(javaName);
342 int len = lengthOfMimeNames(val);
343
344 String mimeName;
345 char highChar;
346 if (len == 0)
347 {
348 // There is no property value, only the javaName, so try and recover
349 mimeName = javaName;
350 highChar = '\u0000'; // don't know the high code point, will need to test every character
351 }
352 else
353 {
354 try {
355 // Get the substring after the Mime names
356 final String highVal = val.substring(len).trim();
357 highChar = (char) Integer.decode(highVal).intValue();
358 }
359 catch( NumberFormatException e) {
360 highChar = 0;
361 }
362 String mimeNames = val.substring(0, len);
363 StringTokenizer st =
364 new StringTokenizer(mimeNames, ",");
365 for (boolean first = true;
366 st.hasMoreTokens();
367 first = false)
368 {
369 mimeName = st.nextToken();
370 EncodingInfo ei = new EncodingInfo(mimeName, javaName, highChar);
371 encodingInfo_list.add(ei);
372 _encodingTableKeyMime.put(mimeName.toUpperCase(), ei);
373 if (first)
374 _encodingTableKeyJava.put(javaName.toUpperCase(), ei);
375 }
376 }
377 }
378 // Convert the Vector of EncodingInfo objects into an array of them,
379 // as that is the kind of thing this method returns.
380 EncodingInfo[] ret_ei = new EncodingInfo[encodingInfo_list.size()];
381 encodingInfo_list.toArray(ret_ei);
382 return ret_ei;
383 }
384 catch (java.net.MalformedURLException mue)
385 {
386 throw new org.apache.xml.serializer.utils.WrappedRuntimeException(mue);
387 }
388 catch (java.io.IOException ioe)
389 {
390 throw new org.apache.xml.serializer.utils.WrappedRuntimeException(ioe);
391 }
392 }
393
394 /**
395 * Get the length of the Mime names within the property value
396 * @param val The value of the property, which should contain a comma
397 * separated list of Mime names, followed optionally by a space and the
398 * high char value
399 * @return
400 */
401 private static int lengthOfMimeNames(String val) {
402 // look for the space preceding the optional high char
403 int len = val.indexOf(' ');
404 // If len is zero it means the optional part is not there, so
405 // the value must be all Mime names, so set the length appropriately
406 if (len < 0)
407 len = val.length();
408
409 return len;
410 }
411
412 /**
413 * Return true if the character is the high member of a surrogate pair.
414 * <p>
415 * This is not a public API.
416 * @param ch the character to test
417 * @xsl.usage internal
418 */
419 static boolean isHighUTF16Surrogate(char ch) {
420 return ('\uD800' <= ch && ch <= '\uDBFF');
421 }
422 /**
423 * Return true if the character is the low member of a surrogate pair.
424 * <p>
425 * This is not a public API.
426 * @param ch the character to test
427 * @xsl.usage internal
428 */
429 static boolean isLowUTF16Surrogate(char ch) {
430 return ('\uDC00' <= ch && ch <= '\uDFFF');
431 }
432 /**
433 * Return the unicode code point represented by the high/low surrogate pair.
434 * <p>
435 * This is not a public API.
436 * @param highSurrogate the high char of the high/low pair
437 * @param lowSurrogate the low char of the high/low pair
438 * @xsl.usage internal
439 */
440 static int toCodePoint(char highSurrogate, char lowSurrogate) {
441 int codePoint =
442 ((highSurrogate - 0xd800) << 10)
443 + (lowSurrogate - 0xdc00)
444 + 0x10000;
445 return codePoint;
446 }
447 /**
448 * Return the unicode code point represented by the char.
449 * A bit of a dummy method, since all it does is return the char,
450 * but as an int value.
451 * <p>
452 * This is not a public API.
453 * @param ch the char.
454 * @xsl.usage internal
455 */
456 static int toCodePoint(char ch) {
457 int codePoint = ch;
458 return codePoint;
459 }
460
461 /**
462 * Characters with values at or below the high code point are
463 * in the encoding. Code point values above this one may or may
464 * not be in the encoding, but lower ones certainly are.
465 * <p>
466 * This is for performance.
467 *
468 * @param encoding The encoding
469 * @return The code point for which characters at or below this code point
470 * are in the encoding. Characters with higher code point may or may not be
471 * in the encoding. A value of zero is returned if the high code point is unknown.
472 * <p>
473 * This method is not a public API.
474 * @xsl.usage internal
475 */
476 static public char getHighChar(String encoding)
477 {
478 final char highCodePoint;
479 EncodingInfo ei;
480
481 String normalizedEncoding = toUpperCaseFast(encoding);
482 ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding);
483 if (ei == null)
484 ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
485 if (ei != null)
486 highCodePoint = ei.getHighChar();
487 else
488 highCodePoint = 0;
489 return highCodePoint;
490 }
491
// Lookup tables filled by loadEncodingInfo(): one keyed by the upper-cased
// Java encoding name, the other by the upper-cased MIME encoding name; the
// values are EncodingInfo objects.
492 private static final Hashtable _encodingTableKeyJava = new Hashtable();
493 private static final Hashtable _encodingTableKeyMime = new Hashtable();
// All known encodings, as returned by loadEncodingInfo(). This declaration
// must stay after the two tables: static initializers run in textual order
// and loadEncodingInfo() puts entries into both tables.
494 private static final EncodingInfo[] _encodings = loadEncodingInfo();
495 }