001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.xml.serializer.utils;
020
021 import java.util.Arrays;
022
023 /**
024 * THIS IS A COPY OF THE XERCES-2J CLASS org.apache.xerces.util.XML11Char
025 *
026 * This class defines the basic properties of characters in XML 1.1. The data
027 * in this class can be used to verify that a character is a valid
028 * XML 1.1 character or if the character is a space, name start, or name
029 * character.
030 * <p>
031 * A series of convenience methods are supplied to ease the burden
032 * of the developer. Using the character as an index into the <code>XML11CHARS</code>
033 * array and applying the appropriate mask flag (e.g.
034 * <code>MASK_VALID</code>), yields the same results as calling the
035 * convenience methods. There is one exception: check the comments
036 * for the <code>isValid</code> method for details.
037 *
038 * @author Glenn Marcy, IBM
039 * @author Andy Clark, IBM
040 * @author Arnaud Le Hors, IBM
041 * @author Neil Graham, IBM
042 * @author Michael Glavassevich, IBM
043 *
044 * @version $Id: XML11Char.java 1225426 2011-12-29 04:13:08Z mrglavas $
045 */
public class XML11Char {

    //
    // Constants
    //

    /**
     * Character flags for XML 1.1, indexed by UTF-16 code unit
     * (0x0000-0xFFFF). Each entry is a bit set built from the
     * <code>MASK_XML11_*</code> constants below; entries that are never
     * assigned in the static initializer stay 0 (no flags set).
     */
    private static final byte[] XML11CHARS = new byte[1 << 16];

    /** XML 1.1 Valid character mask. */
    public static final int MASK_XML11_VALID = 0x01;

    /** XML 1.1 Space character mask. */
    public static final int MASK_XML11_SPACE = 0x02;

    /** XML 1.1 Name start character mask. */
    public static final int MASK_XML11_NAME_START = 0x04;

    /** XML 1.1 Name character mask. */
    public static final int MASK_XML11_NAME = 0x08;

    /** XML 1.1 control character mask */
    public static final int MASK_XML11_CONTROL = 0x10;

    /** XML 1.1 content for external entities (valid - "special" chars - control chars) */
    public static final int MASK_XML11_CONTENT = 0x20;

    /** XML namespaces 1.1 NCNameStart */
    public static final int MASK_XML11_NCNAME_START = 0x40;

    /** XML namespaces 1.1 NCName */
    public static final int MASK_XML11_NCNAME = 0x80;

    /** XML 1.1 content for internal entities (valid - "special" chars) */
    public static final int MASK_XML11_CONTENT_INTERNAL = MASK_XML11_CONTROL | MASK_XML11_CONTENT;

    //
    // Static initialization
    //

    static {

        // Initializing the Character Flag Array
        // Code generated by: XML11CharGenerator.
        //
        // The byte values are bit sets of the MASK_XML11_* constants, e.g.
        //   17  = VALID | CONTROL
        //   33  = VALID | CONTENT
        //   35  = VALID | SPACE | CONTENT
        //   45  = VALID | NAME_START | NAME | CONTENT            (':')
        //   -87 = VALID | NAME | CONTENT | NCNAME                (0xA9)
        //   -19 = VALID | NAME_START | NAME | CONTENT
        //         | NCNAME_START | NCNAME                        (0xED)
        // Index 0, the surrogate block 55296-57343 and 65534-65535 are
        // never assigned and therefore carry no flags.

        Arrays.fill(XML11CHARS, 1, 9, (byte) 17 ); // Fill 8 of value (byte) 17
        XML11CHARS[9] = 35;
        XML11CHARS[10] = 3;
        Arrays.fill(XML11CHARS, 11, 13, (byte) 17 ); // Fill 2 of value (byte) 17
        XML11CHARS[13] = 3;
        Arrays.fill(XML11CHARS, 14, 32, (byte) 17 ); // Fill 18 of value (byte) 17
        XML11CHARS[32] = 35;
        Arrays.fill(XML11CHARS, 33, 38, (byte) 33 ); // Fill 5 of value (byte) 33
        XML11CHARS[38] = 1;
        Arrays.fill(XML11CHARS, 39, 45, (byte) 33 ); // Fill 6 of value (byte) 33
        Arrays.fill(XML11CHARS, 45, 47, (byte) -87 ); // Fill 2 of value (byte) -87
        XML11CHARS[47] = 33;
        Arrays.fill(XML11CHARS, 48, 58, (byte) -87 ); // Fill 10 of value (byte) -87
        XML11CHARS[58] = 45;
        XML11CHARS[59] = 33;
        XML11CHARS[60] = 1;
        Arrays.fill(XML11CHARS, 61, 65, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 65, 91, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 91, 93, (byte) 33 ); // Fill 2 of value (byte) 33
        XML11CHARS[93] = 1;
        XML11CHARS[94] = 33;
        XML11CHARS[95] = -19;
        XML11CHARS[96] = 33;
        Arrays.fill(XML11CHARS, 97, 123, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 123, 127, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 127, 133, (byte) 17 ); // Fill 6 of value (byte) 17
        XML11CHARS[133] = 35;
        Arrays.fill(XML11CHARS, 134, 160, (byte) 17 ); // Fill 26 of value (byte) 17
        Arrays.fill(XML11CHARS, 160, 183, (byte) 33 ); // Fill 23 of value (byte) 33
        XML11CHARS[183] = -87;
        Arrays.fill(XML11CHARS, 184, 192, (byte) 33 ); // Fill 8 of value (byte) 33
        Arrays.fill(XML11CHARS, 192, 215, (byte) -19 ); // Fill 23 of value (byte) -19
        XML11CHARS[215] = 33;
        Arrays.fill(XML11CHARS, 216, 247, (byte) -19 ); // Fill 31 of value (byte) -19
        XML11CHARS[247] = 33;
        Arrays.fill(XML11CHARS, 248, 768, (byte) -19 ); // Fill 520 of value (byte) -19
        Arrays.fill(XML11CHARS, 768, 880, (byte) -87 ); // Fill 112 of value (byte) -87
        Arrays.fill(XML11CHARS, 880, 894, (byte) -19 ); // Fill 14 of value (byte) -19
        XML11CHARS[894] = 33;
        Arrays.fill(XML11CHARS, 895, 8192, (byte) -19 ); // Fill 7297 of value (byte) -19
        Arrays.fill(XML11CHARS, 8192, 8204, (byte) 33 ); // Fill 12 of value (byte) 33
        Arrays.fill(XML11CHARS, 8204, 8206, (byte) -19 ); // Fill 2 of value (byte) -19
        Arrays.fill(XML11CHARS, 8206, 8232, (byte) 33 ); // Fill 26 of value (byte) 33
        XML11CHARS[8232] = 35;
        Arrays.fill(XML11CHARS, 8233, 8255, (byte) 33 ); // Fill 22 of value (byte) 33
        Arrays.fill(XML11CHARS, 8255, 8257, (byte) -87 ); // Fill 2 of value (byte) -87
        Arrays.fill(XML11CHARS, 8257, 8304, (byte) 33 ); // Fill 47 of value (byte) 33
        Arrays.fill(XML11CHARS, 8304, 8592, (byte) -19 ); // Fill 288 of value (byte) -19
        Arrays.fill(XML11CHARS, 8592, 11264, (byte) 33 ); // Fill 2672 of value (byte) 33
        Arrays.fill(XML11CHARS, 11264, 12272, (byte) -19 ); // Fill 1008 of value (byte) -19
        Arrays.fill(XML11CHARS, 12272, 12289, (byte) 33 ); // Fill 17 of value (byte) 33
        Arrays.fill(XML11CHARS, 12289, 55296, (byte) -19 ); // Fill 43007 of value (byte) -19
        Arrays.fill(XML11CHARS, 57344, 63744, (byte) 33 ); // Fill 6400 of value (byte) 33
        Arrays.fill(XML11CHARS, 63744, 64976, (byte) -19 ); // Fill 1232 of value (byte) -19
        Arrays.fill(XML11CHARS, 64976, 65008, (byte) 33 ); // Fill 32 of value (byte) 33
        Arrays.fill(XML11CHARS, 65008, 65534, (byte) -19 ); // Fill 526 of value (byte) -19

    } // <clinit>()

    //
    // Private helpers
    //

    /**
     * Returns true if <code>c</code> lies in 0x0000-0xFFFF and its table
     * entry has at least one bit of <code>mask</code> set.
     */
    private static boolean hasFlag(int c, int mask) {
        // The explicit lower bound keeps negative values (for example an
        // end-of-stream marker of -1) from being used as an array index.
        return 0 <= c && c < 0x10000 && (XML11CHARS[c] & mask) != 0;
    }

    /** Returns true if <code>c</code> lies in 0x10000-0x10FFFF. */
    private static boolean isSupplementary(int c) {
        return 0x10000 <= c && c <= 0x10FFFF;
    }

    /**
     * Returns true if <code>c</code> is flagged with <code>mask</code> in
     * the table or lies in 0x10000-0xEFFFF, the supplementary range that
     * all name-related predicates of this class accept.
     */
    private static boolean isNameLike(int c, int mask) {
        return hasFlag(c, mask) || (0x10000 <= c && c < 0xF0000);
    }

    /**
     * Shared scanner behind the Name, NCName and Nmtoken checks: the first
     * character must satisfy <code>firstMask</code>, every following
     * character <code>restMask</code>. A character may be given either as a
     * single UTF-16 code unit or as a surrogate pair.
     */
    private static boolean matchesNameProduction(String s, int firstMask, int restMask) {
        final int length = s.length();
        if (length == 0) {
            return false;
        }
        int mask = firstMask;
        int i = 0;
        while (i < length) {
            final char unit = s.charAt(i++);
            if (!isNameLike(unit, mask)) {
                // Not acceptable on its own: the only remaining option is
                // a high surrogate followed by a matching low surrogate.
                if (i == length || !isXML11NameHighSurrogate(unit)) {
                    return false;
                }
                final char low = s.charAt(i++);
                if (!Character.isLowSurrogate(low)
                        || !isNameLike(Character.toCodePoint(unit, low), mask)) {
                    return false;
                }
            }
            mask = restMask;
        }
        return true;
    }

    //
    // Public static methods
    //

    /**
     * Returns true if the specified character is a space character
     * as amended in the XML 1.1 specification.
     *
     * @param c The character to check.
     * @return true if <code>c</code> is flagged as a space character;
     *         false otherwise, including for any value outside
     *         0x0000-0xFFFF.
     */
    public static boolean isXML11Space(int c) {
        return hasFlag(c, MASK_XML11_SPACE);
    } // isXML11Space(int):boolean

    /**
     * Returns true if the specified character is valid. This method
     * also checks the surrogate character range from 0x10000 to 0x10FFFF.
     * <p>
     * If the program chooses to apply the mask directly to the
     * <code>XML11CHARS</code> array, then they are responsible for checking
     * the surrogate character range.
     *
     * @param c The character to check.
     * @return true if <code>c</code> is flagged valid or lies in
     *         0x10000-0x10FFFF; false otherwise, including for negative
     *         values.
     */
    public static boolean isXML11Valid(int c) {
        return hasFlag(c, MASK_XML11_VALID) || isSupplementary(c);
    } // isXML11Valid(int):boolean

    /**
     * Returns true if the specified character is invalid.
     *
     * @param c The character to check.
     * @return the negation of {@link #isXML11Valid(int)}.
     */
    public static boolean isXML11Invalid(int c) {
        return !isXML11Valid(c);
    } // isXML11Invalid(int):boolean

    /**
     * Returns true if the specified character is valid and permitted outside
     * of a character reference.
     * That is, this method returns true for the same characters as
     * isXML11Valid, except that it returns false for the "control
     * characters" (those flagged with <code>MASK_XML11_CONTROL</code>).
     *
     * @param c The character to check.
     * @return true if <code>c</code> is valid and not a control character,
     *         or lies in 0x10000-0x10FFFF.
     */
    public static boolean isXML11ValidLiteral(int c) {
        return (hasFlag(c, MASK_XML11_VALID) && !hasFlag(c, MASK_XML11_CONTROL))
            || isSupplementary(c);
    } // isXML11ValidLiteral(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an external parsed entity.
     *
     * @param c The character to check.
     * @return true if <code>c</code> is flagged with
     *         <code>MASK_XML11_CONTENT</code> or lies in 0x10000-0x10FFFF.
     */
    public static boolean isXML11Content(int c) {
        return hasFlag(c, MASK_XML11_CONTENT) || isSupplementary(c);
    } // isXML11Content(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an internal parsed entity.
     *
     * @param c The character to check.
     * @return true if <code>c</code> carries any bit of
     *         <code>MASK_XML11_CONTENT_INTERNAL</code> (content or control)
     *         or lies in 0x10000-0x10FFFF.
     */
    public static boolean isXML11InternalEntityContent(int c) {
        return hasFlag(c, MASK_XML11_CONTENT_INTERNAL) || isSupplementary(c);
    } // isXML11InternalEntityContent(int):boolean

    /**
     * Returns true if the specified character is a valid name start
     * character as defined by production [4] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     * @return true if <code>c</code> is flagged as a name start character
     *         or lies in 0x10000-0xEFFFF.
     */
    public static boolean isXML11NameStart(int c) {
        return isNameLike(c, MASK_XML11_NAME_START);
    } // isXML11NameStart(int):boolean

    /**
     * Returns true if the specified character is a valid name
     * character as defined by production [4a] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     * @return true if <code>c</code> is flagged as a name character
     *         or lies in 0x10000-0xEFFFF.
     */
    public static boolean isXML11Name(int c) {
        return isNameLike(c, MASK_XML11_NAME);
    } // isXML11Name(int):boolean

    /**
     * Returns true if the specified character is a valid NCName start
     * character as defined by production [4] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     * @return true if <code>c</code> is flagged as an NCName start
     *         character or lies in 0x10000-0xEFFFF.
     */
    public static boolean isXML11NCNameStart(int c) {
        return isNameLike(c, MASK_XML11_NCNAME_START);
    } // isXML11NCNameStart(int):boolean

    /**
     * Returns true if the specified character is a valid NCName
     * character as defined by production [5] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     * @return true if <code>c</code> is flagged as an NCName character
     *         or lies in 0x10000-0xEFFFF.
     */
    public static boolean isXML11NCName(int c) {
        return isNameLike(c, MASK_XML11_NCNAME);
    } // isXML11NCName(int):boolean

    /**
     * Returns whether the given character is a valid
     * high surrogate for a name character. This includes
     * all high surrogates for characters [0x10000-0xEFFFF].
     * In other words everything excluding planes 15 and 16.
     *
     * @param c The character to check.
     * @return true if <code>c</code> lies in 0xD800-0xDB7F.
     */
    public static boolean isXML11NameHighSurrogate(int c) {
        return (0xD800 <= c && c <= 0xDB7F);
    } // isXML11NameHighSurrogate(int):boolean

    /*
     * [5] Name ::= NameStartChar NameChar*
     */
    /**
     * Check to see if a string is a valid Name according to [5]
     * in the XML 1.1 Recommendation
     *
     * @param name string to check
     * @return true if name is a valid Name; false for the empty string
     * @throws NullPointerException if <code>name</code> is null
     */
    public static boolean isXML11ValidName(String name) {
        return matchesNameProduction(name, MASK_XML11_NAME_START, MASK_XML11_NAME);
    } // isXML11ValidName(String):boolean

    /*
     * from the namespace 1.1 rec
     * [4] NCName ::= NCNameStartChar NCNameChar*
     */
    /**
     * Check to see if a string is a valid NCName according to [4]
     * from the XML Namespaces 1.1 Recommendation
     *
     * @param ncName string to check
     * @return true if name is a valid NCName; false for the empty string
     * @throws NullPointerException if <code>ncName</code> is null
     */
    public static boolean isXML11ValidNCName(String ncName) {
        return matchesNameProduction(ncName, MASK_XML11_NCNAME_START, MASK_XML11_NCNAME);
    } // isXML11ValidNCName(String):boolean

    /*
     * [7] Nmtoken ::= (NameChar)+
     */
    /**
     * Check to see if a string is a valid Nmtoken according to [7]
     * in the XML 1.1 Recommendation
     *
     * @param nmtoken string to check
     * @return true if nmtoken is a valid Nmtoken; false for the empty string
     * @throws NullPointerException if <code>nmtoken</code> is null
     */
    public static boolean isXML11ValidNmtoken(String nmtoken) {
        // An Nmtoken has no distinguished first character: every position
        // is checked against the NameChar flag.
        return matchesNameProduction(nmtoken, MASK_XML11_NAME, MASK_XML11_NAME);
    } // isXML11ValidNmtoken(String):boolean

} // class XML11Char
414