001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.xml.utils;
020
021 import java.util.Arrays;
022
023
/**
 * THIS IS A COPY OF THE XERCES-2J CLASS org.apache.xerces.util.XML11Char
 *
 * This class defines the basic properties of characters in XML 1.1. The data
 * in this class can be used to verify that a character is a valid
 * XML 1.1 character or if the character is a space, name start, or name
 * character.
 * <p>
 * The character flags are kept in a private table indexed by UTF-16 code
 * unit (0x0000-0xFFFF). The public <code>MASK_XML11_*</code> constants name
 * the individual flag bits and the static convenience methods apply them.
 * Values outside the table are classified by range checks inside each
 * method: negative values never match any class, and values of 0x10000 or
 * above are handled per method as documented there.
 *
 * @version $Id: XML11Char.java 468655 2006-10-28 07:12:06Z minchau $
 */
public class XML11Char {

    //
    // Constants
    //

    /** Character flags for XML 1.1, one byte of MASK_XML11_* bits per UTF-16 code unit. */
    private static final byte[] XML11CHARS = new byte[1 << 16];

    /** XML 1.1 Valid character mask. */
    public static final int MASK_XML11_VALID = 0x01;

    /** XML 1.1 Space character mask. */
    public static final int MASK_XML11_SPACE = 0x02;

    /** XML 1.1 Name start character mask. */
    public static final int MASK_XML11_NAME_START = 0x04;

    /** XML 1.1 Name character mask. */
    public static final int MASK_XML11_NAME = 0x08;

    /** XML 1.1 control character mask */
    public static final int MASK_XML11_CONTROL = 0x10;

    /** XML 1.1 content for external entities (valid - "special" chars - control chars) */
    public static final int MASK_XML11_CONTENT = 0x20;

    /** XML namespaces 1.1 NCNameStart */
    public static final int MASK_XML11_NCNAME_START = 0x40;

    /** XML namespaces 1.1 NCName */
    public static final int MASK_XML11_NCNAME = 0x80;

    /** XML 1.1 content for internal entities (valid - "special" chars) */
    public static final int MASK_XML11_CONTENT_INTERNAL = MASK_XML11_CONTROL | MASK_XML11_CONTENT;

    //
    // Static initialization
    //

    static {

        // Initializing the Character Flag Array
        // Code generated by: XML11CharGenerator.
        //
        // Entries that are never assigned (index 0, 55296-57343 and
        // 65534-65535) keep the array default of 0, i.e. no flag set.

        Arrays.fill(XML11CHARS, 1, 9, (byte) 17 ); // Fill 8 of value (byte) 17
        XML11CHARS[9] = 35;
        XML11CHARS[10] = 3;
        Arrays.fill(XML11CHARS, 11, 13, (byte) 17 ); // Fill 2 of value (byte) 17
        XML11CHARS[13] = 3;
        Arrays.fill(XML11CHARS, 14, 32, (byte) 17 ); // Fill 18 of value (byte) 17
        XML11CHARS[32] = 35;
        Arrays.fill(XML11CHARS, 33, 38, (byte) 33 ); // Fill 5 of value (byte) 33
        XML11CHARS[38] = 1;
        Arrays.fill(XML11CHARS, 39, 45, (byte) 33 ); // Fill 6 of value (byte) 33
        Arrays.fill(XML11CHARS, 45, 47, (byte) -87 ); // Fill 2 of value (byte) -87
        XML11CHARS[47] = 33;
        Arrays.fill(XML11CHARS, 48, 58, (byte) -87 ); // Fill 10 of value (byte) -87
        XML11CHARS[58] = 45;
        XML11CHARS[59] = 33;
        XML11CHARS[60] = 1;
        Arrays.fill(XML11CHARS, 61, 65, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 65, 91, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 91, 93, (byte) 33 ); // Fill 2 of value (byte) 33
        XML11CHARS[93] = 1;
        XML11CHARS[94] = 33;
        XML11CHARS[95] = -19;
        XML11CHARS[96] = 33;
        Arrays.fill(XML11CHARS, 97, 123, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 123, 127, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 127, 133, (byte) 17 ); // Fill 6 of value (byte) 17
        XML11CHARS[133] = 35;
        Arrays.fill(XML11CHARS, 134, 160, (byte) 17 ); // Fill 26 of value (byte) 17
        Arrays.fill(XML11CHARS, 160, 183, (byte) 33 ); // Fill 23 of value (byte) 33
        XML11CHARS[183] = -87;
        Arrays.fill(XML11CHARS, 184, 192, (byte) 33 ); // Fill 8 of value (byte) 33
        Arrays.fill(XML11CHARS, 192, 215, (byte) -19 ); // Fill 23 of value (byte) -19
        XML11CHARS[215] = 33;
        Arrays.fill(XML11CHARS, 216, 247, (byte) -19 ); // Fill 31 of value (byte) -19
        XML11CHARS[247] = 33;
        Arrays.fill(XML11CHARS, 248, 768, (byte) -19 ); // Fill 520 of value (byte) -19
        Arrays.fill(XML11CHARS, 768, 880, (byte) -87 ); // Fill 112 of value (byte) -87
        Arrays.fill(XML11CHARS, 880, 894, (byte) -19 ); // Fill 14 of value (byte) -19
        XML11CHARS[894] = 33;
        Arrays.fill(XML11CHARS, 895, 8192, (byte) -19 ); // Fill 7297 of value (byte) -19
        Arrays.fill(XML11CHARS, 8192, 8204, (byte) 33 ); // Fill 12 of value (byte) 33
        Arrays.fill(XML11CHARS, 8204, 8206, (byte) -19 ); // Fill 2 of value (byte) -19
        Arrays.fill(XML11CHARS, 8206, 8232, (byte) 33 ); // Fill 26 of value (byte) 33
        XML11CHARS[8232] = 35;
        Arrays.fill(XML11CHARS, 8233, 8255, (byte) 33 ); // Fill 22 of value (byte) 33
        Arrays.fill(XML11CHARS, 8255, 8257, (byte) -87 ); // Fill 2 of value (byte) -87
        Arrays.fill(XML11CHARS, 8257, 8304, (byte) 33 ); // Fill 47 of value (byte) 33
        Arrays.fill(XML11CHARS, 8304, 8592, (byte) -19 ); // Fill 288 of value (byte) -19
        Arrays.fill(XML11CHARS, 8592, 11264, (byte) 33 ); // Fill 2672 of value (byte) 33
        Arrays.fill(XML11CHARS, 11264, 12272, (byte) -19 ); // Fill 1008 of value (byte) -19
        Arrays.fill(XML11CHARS, 12272, 12289, (byte) 33 ); // Fill 17 of value (byte) 33
        Arrays.fill(XML11CHARS, 12289, 55296, (byte) -19 ); // Fill 43007 of value (byte) -19
        Arrays.fill(XML11CHARS, 57344, 63744, (byte) 33 ); // Fill 6400 of value (byte) 33
        Arrays.fill(XML11CHARS, 63744, 64976, (byte) -19 ); // Fill 1232 of value (byte) -19
        Arrays.fill(XML11CHARS, 64976, 65008, (byte) 33 ); // Fill 32 of value (byte) 33
        Arrays.fill(XML11CHARS, 65008, 65534, (byte) -19 ); // Fill 526 of value (byte) -19

    } // <clinit>()

    //
    // Private helpers
    //

    /**
     * Returns the flag bits stored for <code>c</code>, or 0 when
     * <code>c</code> lies outside the table (negative, or 0x10000 and above).
     * The lower bound matters: without it a negative argument would index
     * the array and raise ArrayIndexOutOfBoundsException.
     */
    private static int flagsOf(int c) {
        return (0 <= c && c < 0x10000) ? XML11CHARS[c] : 0;
    }

    /** Returns true if <code>c</code> lies in 0x10000-0x10FFFF. */
    private static boolean isSupplementary(int c) {
        return 0x10000 <= c && c <= 0x10FFFF;
    }

    /**
     * Returns true if <code>c</code> carries any bit of <code>mask</code> in
     * the table, or lies in 0x10000-0xEFFFF (every supplementary code point
     * below plane 15 is accepted for all four name classes).
     */
    private static boolean matchesNameClass(int c, int mask) {
        return (flagsOf(c) & mask) != 0 || (0x10000 <= c && c < 0xF0000);
    }

    /**
     * Returns how many UTF-16 units (1 or 2) the character starting at
     * <code>index</code> occupies if it belongs to the class selected by
     * <code>mask</code>, or 0 if it does not belong to that class.
     */
    private static int nameCharLength(String s, int index, int mask) {
        final char unit = s.charAt(index);
        if (matchesNameClass(unit, mask)) {
            return 1;
        }
        // Not a name character by itself: it can only be accepted as the
        // first half of a surrogate pair encoding 0x10000-0xEFFFF.
        if (isXML11NameHighSurrogate(unit) && index + 1 < s.length()) {
            final char low = s.charAt(index + 1);
            if (Character.isLowSurrogate(low)
                    && matchesNameClass(Character.toCodePoint(unit, low), mask)) {
                return 2;
            }
        }
        return 0;
    }

    /**
     * Returns true if <code>s</code> is non-empty, its first character
     * matches <code>startMask</code> and every following character matches
     * <code>restMask</code>.
     */
    private static boolean isNameSequence(String s, int startMask, int restMask) {
        final int length = s.length();
        if (length == 0) {
            return false;
        }
        int mask = startMask;
        int i = 0;
        while (i < length) {
            final int consumed = nameCharLength(s, i, mask);
            if (consumed == 0) {
                return false;
            }
            i += consumed;
            mask = restMask;
        }
        return true;
    }

    //
    // Public static methods
    //

    /**
     * Returns true if the specified character is a space character
     * as amended in the XML 1.1 specification.
     *
     * @param c The character to check.
     * @return true if the space flag is set for <code>c</code>; false
     *         otherwise, including for negative values and values of
     *         0x10000 or above.
     */
    public static boolean isXML11Space(int c) {
        return (flagsOf(c) & MASK_XML11_SPACE) != 0;
    } // isXML11Space(int):boolean

    /**
     * Returns true if the specified character is valid. This method
     * also accepts the supplementary range from 0x10000 to 0x10FFFF,
     * which is not represented in the flag table.
     *
     * @param c The character to check.
     * @return true if <code>c</code> is flagged valid or lies in
     *         0x10000-0x10FFFF; false otherwise, including for negative
     *         values.
     */
    public static boolean isXML11Valid(int c) {
        return (flagsOf(c) & MASK_XML11_VALID) != 0 || isSupplementary(c);
    } // isXML11Valid(int):boolean

    /**
     * Returns true if the specified character is invalid.
     *
     * @param c The character to check.
     * @return the negation of {@link #isXML11Valid(int)}.
     */
    public static boolean isXML11Invalid(int c) {
        return !isXML11Valid(c);
    } // isXML11Invalid(int):boolean

    /**
     * Returns true if the specified character is valid and permitted outside
     * of a character reference.
     * That is, this method will return false for the same set as
     * isXML11Valid, except it also reports false for "control characters".
     *
     * @param c The character to check.
     * @return true if <code>c</code> is flagged valid and not flagged as a
     *         control character, or lies in 0x10000-0x10FFFF.
     */
    public static boolean isXML11ValidLiteral(int c) {
        final int flags = flagsOf(c);
        return ((flags & MASK_XML11_VALID) != 0 && (flags & MASK_XML11_CONTROL) == 0)
                || isSupplementary(c);
    } // isXML11ValidLiteral(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an external parsed entity.
     *
     * @param c The character to check.
     * @return true if the content flag is set for <code>c</code> or
     *         <code>c</code> lies in 0x10000-0x10FFFF.
     */
    public static boolean isXML11Content(int c) {
        return (flagsOf(c) & MASK_XML11_CONTENT) != 0 || isSupplementary(c);
    } // isXML11Content(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an internal parsed entity.
     *
     * @param c The character to check.
     * @return true if the content flag or the control flag is set for
     *         <code>c</code>, or <code>c</code> lies in 0x10000-0x10FFFF.
     */
    public static boolean isXML11InternalEntityContent(int c) {
        return (flagsOf(c) & MASK_XML11_CONTENT_INTERNAL) != 0 || isSupplementary(c);
    } // isXML11InternalEntityContent(int):boolean

    /**
     * Returns true if the specified character is a valid name start
     * character as defined by production [4] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     * @return true if the name-start flag is set for <code>c</code> or
     *         <code>c</code> lies in 0x10000-0xEFFFF.
     */
    public static boolean isXML11NameStart(int c) {
        return matchesNameClass(c, MASK_XML11_NAME_START);
    } // isXML11NameStart(int):boolean

    /**
     * Returns true if the specified character is a valid name
     * character as defined by production [4a] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     * @return true if the name flag is set for <code>c</code> or
     *         <code>c</code> lies in 0x10000-0xEFFFF.
     */
    public static boolean isXML11Name(int c) {
        return matchesNameClass(c, MASK_XML11_NAME);
    } // isXML11Name(int):boolean

    /**
     * Returns true if the specified character is a valid NCName start
     * character as defined by production [4] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     * @return true if the NCName-start flag is set for <code>c</code> or
     *         <code>c</code> lies in 0x10000-0xEFFFF.
     */
    public static boolean isXML11NCNameStart(int c) {
        return matchesNameClass(c, MASK_XML11_NCNAME_START);
    } // isXML11NCNameStart(int):boolean

    /**
     * Returns true if the specified character is a valid NCName
     * character as defined by production [5] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     * @return true if the NCName flag is set for <code>c</code> or
     *         <code>c</code> lies in 0x10000-0xEFFFF.
     */
    public static boolean isXML11NCName(int c) {
        return matchesNameClass(c, MASK_XML11_NCNAME);
    } // isXML11NCName(int):boolean

    /**
     * Returns whether the given character is a valid
     * high surrogate for a name character. This includes
     * all high surrogates for characters [0x10000-0xEFFFF].
     * In other words everything excluding planes 15 and 16.
     *
     * @param c The character to check.
     * @return true if <code>c</code> lies in 0xD800-0xDB7F.
     */
    public static boolean isXML11NameHighSurrogate(int c) {
        return (0xD800 <= c && c <= 0xDB7F);
    } // isXML11NameHighSurrogate(int):boolean

    /*
     * [5] Name ::= NameStartChar NameChar*
     */
    /**
     * Check to see if a string is a valid Name according to [5]
     * in the XML 1.1 Recommendation. Characters above 0xFFFF are
     * recognised when encoded as a surrogate pair; an unpaired or
     * out-of-range surrogate makes the string invalid.
     *
     * @param name string to check
     * @return true if name is a valid Name; false for the empty string
     * @throws NullPointerException if <code>name</code> is null
     */
    public static boolean isXML11ValidName(String name) {
        return isNameSequence(name, MASK_XML11_NAME_START, MASK_XML11_NAME);
    } // isXML11ValidName(String):boolean


    /*
     * from the namespace 1.1 rec
     * [4] NCName ::= NCNameStartChar NCNameChar*
     */
    /**
     * Check to see if a string is a valid NCName according to [4]
     * from the XML Namespaces 1.1 Recommendation. Surrogate pairs are
     * treated as in {@link #isXML11ValidName(String)}.
     *
     * @param ncName string to check
     * @return true if ncName is a valid NCName; false for the empty string
     * @throws NullPointerException if <code>ncName</code> is null
     */
    public static boolean isXML11ValidNCName(String ncName) {
        return isNameSequence(ncName, MASK_XML11_NCNAME_START, MASK_XML11_NCNAME);
    } // isXML11ValidNCName(String):boolean

    /*
     * [7] Nmtoken ::= (NameChar)+
     */
    /**
     * Check to see if a string is a valid Nmtoken according to [7]
     * in the XML 1.1 Recommendation. Unlike a Name, the first character
     * only has to be a name character, not a name start character.
     *
     * @param nmtoken string to check
     * @return true if nmtoken is a valid Nmtoken; false for the empty string
     * @throws NullPointerException if <code>nmtoken</code> is null
     */
    public static boolean isXML11ValidNmtoken(String nmtoken) {
        return isNameSequence(nmtoken, MASK_XML11_NAME, MASK_XML11_NAME);
    } // isXML11ValidNmtoken(String):boolean

    /**
     * Simple check to determine if a QName is legal: either a single
     * NCName, or a prefix and a local part that are both NCNames and are
     * separated by exactly one colon.
     *
     * @param str string to check
     * @return true if <code>str</code> is legal, false if it is illegal
     *         (including the empty string and a leading or trailing colon)
     * @throws NullPointerException if <code>str</code> is null
     */
    public static boolean isXML11ValidQName(String str) {

        final int colon = str.indexOf(':');

        // colon == length - 1 rejects a trailing colon and, because
        // indexOf yields -1, the empty string as well.
        if (colon == 0 || colon == str.length() - 1) {
            return false;
        }

        if (colon < 0) {
            return isXML11ValidNCName(str);
        }

        // A second colon ends up in the local part, which then fails the
        // NCName check because ':' is not an NCName character.
        final String prefix = str.substring(0, colon);
        final String localPart = str.substring(colon + 1);
        return isXML11ValidNCName(prefix) && isXML11ValidNCName(localPart);
    } // isXML11ValidQName(String):boolean

} // class XML11Char
432