001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements. See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership. The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the  "License");
007     * you may not use this file except in compliance with the License.
008     * You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.xml.utils;
020    
021    import java.util.Arrays;
022    
023    
/**
 * THIS IS A COPY OF THE XERCES-2J CLASS org.apache.xerces.util.XML11Char
 *
 * This class defines the basic properties of characters in XML 1.1. The data
 * in this class can be used to verify that a character is a valid
 * XML 1.1 character or if the character is a space, name start, or name
 * character.
 * <p>
 * A series of convenience methods are supplied to ease the burden
 * of the developer. The per-character flag table itself is private; the
 * public <code>MASK_XML11_*</code> constants document which bit each
 * convenience method tests. The table only covers the Basic Multilingual
 * Plane (0x0000-0xFFFF); the convenience methods additionally handle
 * supplementary code points (0x10000-0x10FFFF) and return <code>false</code>
 * for any value outside 0x0000-0x10FFFF, including negative values.
 *
 * @version $Id: XML11Char.java 468655 2006-10-28 07:12:06Z minchau $
 */
public class XML11Char {

    //
    // Constants
    //

    /** Character flags for XML 1.1, indexed by UTF-16 code unit. */
    private static final byte XML11CHARS [] = new byte [1 << 16];

    /** XML 1.1 Valid character mask. */
    public static final int MASK_XML11_VALID = 0x01;

    /** XML 1.1 Space character mask. */
    public static final int MASK_XML11_SPACE = 0x02;

    /** XML 1.1 Name start character mask. */
    public static final int MASK_XML11_NAME_START = 0x04;

    /** XML 1.1 Name character mask. */
    public static final int MASK_XML11_NAME = 0x08;

    /** XML 1.1 control character mask */
    public static final int MASK_XML11_CONTROL = 0x10;

    /** XML 1.1 content for external entities (valid - "special" chars - control chars) */
    public static final int MASK_XML11_CONTENT = 0x20;

    /** XML namespaces 1.1 NCNameStart */
    public static final int MASK_XML11_NCNAME_START = 0x40;

    /** XML namespaces 1.1 NCName */
    public static final int MASK_XML11_NCNAME = 0x80;

    /** XML 1.1 content for internal entities (valid - "special" chars) */
    public static final int MASK_XML11_CONTENT_INTERNAL = MASK_XML11_CONTROL | MASK_XML11_CONTENT;

    //
    // Static initialization
    //

    static {

        // Initializing the Character Flag Array
        // Code generated by: XML11CharGenerator.
        //
        // Entries that are never assigned (0, the surrogate block
        // 55296-57343, and 65534-65535) keep the default value 0,
        // i.e. no flag set.

        Arrays.fill(XML11CHARS, 1, 9, (byte) 17 ); // Fill 8 of value (byte) 17
        XML11CHARS[9] = 35;
        XML11CHARS[10] = 3;
        Arrays.fill(XML11CHARS, 11, 13, (byte) 17 ); // Fill 2 of value (byte) 17
        XML11CHARS[13] = 3;
        Arrays.fill(XML11CHARS, 14, 32, (byte) 17 ); // Fill 18 of value (byte) 17
        XML11CHARS[32] = 35;
        Arrays.fill(XML11CHARS, 33, 38, (byte) 33 ); // Fill 5 of value (byte) 33
        XML11CHARS[38] = 1;
        Arrays.fill(XML11CHARS, 39, 45, (byte) 33 ); // Fill 6 of value (byte) 33
        Arrays.fill(XML11CHARS, 45, 47, (byte) -87 ); // Fill 2 of value (byte) -87
        XML11CHARS[47] = 33;
        Arrays.fill(XML11CHARS, 48, 58, (byte) -87 ); // Fill 10 of value (byte) -87
        XML11CHARS[58] = 45;
        XML11CHARS[59] = 33;
        XML11CHARS[60] = 1;
        Arrays.fill(XML11CHARS, 61, 65, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 65, 91, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 91, 93, (byte) 33 ); // Fill 2 of value (byte) 33
        XML11CHARS[93] = 1;
        XML11CHARS[94] = 33;
        XML11CHARS[95] = -19;
        XML11CHARS[96] = 33;
        Arrays.fill(XML11CHARS, 97, 123, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 123, 127, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 127, 133, (byte) 17 ); // Fill 6 of value (byte) 17
        XML11CHARS[133] = 35;
        Arrays.fill(XML11CHARS, 134, 160, (byte) 17 ); // Fill 26 of value (byte) 17
        Arrays.fill(XML11CHARS, 160, 183, (byte) 33 ); // Fill 23 of value (byte) 33
        XML11CHARS[183] = -87;
        Arrays.fill(XML11CHARS, 184, 192, (byte) 33 ); // Fill 8 of value (byte) 33
        Arrays.fill(XML11CHARS, 192, 215, (byte) -19 ); // Fill 23 of value (byte) -19
        XML11CHARS[215] = 33;
        Arrays.fill(XML11CHARS, 216, 247, (byte) -19 ); // Fill 31 of value (byte) -19
        XML11CHARS[247] = 33;
        Arrays.fill(XML11CHARS, 248, 768, (byte) -19 ); // Fill 520 of value (byte) -19
        Arrays.fill(XML11CHARS, 768, 880, (byte) -87 ); // Fill 112 of value (byte) -87
        Arrays.fill(XML11CHARS, 880, 894, (byte) -19 ); // Fill 14 of value (byte) -19
        XML11CHARS[894] = 33;
        Arrays.fill(XML11CHARS, 895, 8192, (byte) -19 ); // Fill 7297 of value (byte) -19
        Arrays.fill(XML11CHARS, 8192, 8204, (byte) 33 ); // Fill 12 of value (byte) 33
        Arrays.fill(XML11CHARS, 8204, 8206, (byte) -19 ); // Fill 2 of value (byte) -19
        Arrays.fill(XML11CHARS, 8206, 8232, (byte) 33 ); // Fill 26 of value (byte) 33
        XML11CHARS[8232] = 35;
        Arrays.fill(XML11CHARS, 8233, 8255, (byte) 33 ); // Fill 22 of value (byte) 33
        Arrays.fill(XML11CHARS, 8255, 8257, (byte) -87 ); // Fill 2 of value (byte) -87
        Arrays.fill(XML11CHARS, 8257, 8304, (byte) 33 ); // Fill 47 of value (byte) 33
        Arrays.fill(XML11CHARS, 8304, 8592, (byte) -19 ); // Fill 288 of value (byte) -19
        Arrays.fill(XML11CHARS, 8592, 11264, (byte) 33 ); // Fill 2672 of value (byte) 33
        Arrays.fill(XML11CHARS, 11264, 12272, (byte) -19 ); // Fill 1008 of value (byte) -19
        Arrays.fill(XML11CHARS, 12272, 12289, (byte) 33 ); // Fill 17 of value (byte) 33
        Arrays.fill(XML11CHARS, 12289, 55296, (byte) -19 ); // Fill 43007 of value (byte) -19
        Arrays.fill(XML11CHARS, 57344, 63744, (byte) 33 ); // Fill 6400 of value (byte) 33
        Arrays.fill(XML11CHARS, 63744, 64976, (byte) -19 ); // Fill 1232 of value (byte) -19
        Arrays.fill(XML11CHARS, 64976, 65008, (byte) 33 ); // Fill 32 of value (byte) 33
        Arrays.fill(XML11CHARS, 65008, 65534, (byte) -19 ); // Fill 526 of value (byte) -19

    } // <clinit>()

    //
    // Private helpers
    //

    /**
     * Returns true if <code>c</code> lies in 0x0000-0xFFFF and its table
     * entry has at least one of the bits in <code>mask</code> set.
     * The explicit lower bound keeps negative values (for example an
     * end-of-stream marker of -1) from indexing the table.
     */
    private static boolean hasFlag(int c, int mask) {
        return 0 <= c && c < 0x10000 && (XML11CHARS[c] & mask) != 0;
    } // hasFlag(int,int):boolean

    /** Returns true if <code>c</code> is a supplementary code point (0x10000-0x10FFFF). */
    private static boolean isSupplementary(int c) {
        return 0x10000 <= c && c <= 0x10FFFF;
    } // isSupplementary(int):boolean

    /** Returns true if <code>c</code> is a supplementary code point allowed in names (0x10000-0xEFFFF). */
    private static boolean isSupplementaryNameChar(int c) {
        return 0x10000 <= c && c < 0xF0000;
    } // isSupplementaryNameChar(int):boolean

    /**
     * Measures the name character starting at <code>index</code>.
     *
     * @return 1 if the UTF-16 unit at <code>index</code> carries a flag in
     *         <code>mask</code>; 2 if it starts a surrogate pair that encodes
     *         a code point in 0x10000-0xEFFFF; 0 if neither applies.
     */
    private static int nameCharLength(String s, int index, int mask) {
        final char unit = s.charAt(index);
        if ((XML11CHARS[unit] & mask) != 0) {
            return 1;
        }
        // A high surrogate accepted by isXML11NameHighSurrogate combined with
        // any low surrogate always decodes to 0x10000-0xEFFFF, which every
        // name-related predicate in this class accepts, so no further range
        // check on the decoded code point is needed.
        if (isXML11NameHighSurrogate(unit)
                && index + 1 < s.length()
                && Character.isLowSurrogate(s.charAt(index + 1))) {
            return 2;
        }
        return 0;
    } // nameCharLength(String,int,int):int

    /**
     * Returns true if <code>s</code> is non-empty, its first character
     * satisfies <code>firstMask</code> and every following character
     * satisfies <code>restMask</code> (surrogate pairs are handled as
     * described for {@link #nameCharLength}).
     */
    private static boolean matchesNameProduction(String s, int firstMask, int restMask) {
        final int length = s.length();
        if (length == 0) {
            return false;
        }
        int mask = firstMask;
        int i = 0;
        while (i < length) {
            final int consumed = nameCharLength(s, i, mask);
            if (consumed == 0) {
                return false;
            }
            i += consumed;
            mask = restMask;
        }
        return true;
    } // matchesNameProduction(String,int,int):boolean

    //
    // Public static methods
    //

    /**
     * Returns true if the specified character is a space character
     * as amended in the XML 1.1 specification. In this table those are
     * 0x09, 0x0A, 0x0D, 0x20, 0x85 and 0x2028.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Space(int c) {
        return hasFlag(c, MASK_XML11_SPACE);
    } // isXML11Space(int):boolean

    /**
     * Returns true if the specified character is valid. This method
     * also accepts the supplementary range from 0x10000 to 0x10FFFF,
     * which the flag table does not cover.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Valid(int c) {
        return hasFlag(c, MASK_XML11_VALID) || isSupplementary(c);
    } // isXML11Valid(int):boolean

    /**
     * Returns true if the specified character is invalid, i.e. exactly
     * when {@link #isXML11Valid(int)} returns false.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Invalid(int c) {
        return !isXML11Valid(c);
    } // isXML11Invalid(int):boolean

    /**
     * Returns true if the specified character is valid and permitted outside
     * of a character reference.
     * That is, this method accepts the same characters as
     * {@link #isXML11Valid(int)} except those flagged as control characters,
     * for which it returns false.
     *
     * @param c The character to check.
     */
    public static boolean isXML11ValidLiteral(int c) {
        if (0 <= c && c < 0x10000) {
            final int flags = XML11CHARS[c];
            return (flags & MASK_XML11_VALID) != 0 && (flags & MASK_XML11_CONTROL) == 0;
        }
        return isSupplementary(c);
    } // isXML11ValidLiteral(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an external parsed entity.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Content(int c) {
        return hasFlag(c, MASK_XML11_CONTENT) || isSupplementary(c);
    } // isXML11Content(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an internal parsed entity. In addition to the characters
     * accepted by {@link #isXML11Content(int)} this accepts the characters
     * flagged as control characters.
     *
     * @param c The character to check.
     */
    public static boolean isXML11InternalEntityContent(int c) {
        return hasFlag(c, MASK_XML11_CONTENT_INTERNAL) || isSupplementary(c);
    } // isXML11InternalEntityContent(int):boolean

    /**
     * Returns true if the specified character is a valid name start
     * character as defined by production [4] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NameStart(int c) {
        return hasFlag(c, MASK_XML11_NAME_START) || isSupplementaryNameChar(c);
    } // isXML11NameStart(int):boolean

    /**
     * Returns true if the specified character is a valid name
     * character as defined by production [4a] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Name(int c) {
        return hasFlag(c, MASK_XML11_NAME) || isSupplementaryNameChar(c);
    } // isXML11Name(int):boolean

    /**
     * Returns true if the specified character is a valid NCName start
     * character as defined by production [4] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NCNameStart(int c) {
        return hasFlag(c, MASK_XML11_NCNAME_START) || isSupplementaryNameChar(c);
    } // isXML11NCNameStart(int):boolean

    /**
     * Returns true if the specified character is a valid NCName
     * character as defined by production [5] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NCName(int c) {
        return hasFlag(c, MASK_XML11_NCNAME) || isSupplementaryNameChar(c);
    } // isXML11NCName(int):boolean

    /**
     * Returns whether the given character is a valid
     * high surrogate for a name character. This includes
     * all high surrogates for characters [0x10000-0xEFFFF].
     * In other words everything excluding planes 15 and 16.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NameHighSurrogate(int c) {
        return (0xD800 <= c && c <= 0xDB7F);
    } // isXML11NameHighSurrogate(int):boolean

    /*
     * [5] Name ::= NameStartChar NameChar*
     */
    /**
     * Check to see if a string is a valid Name according to [5]
     * in the XML 1.1 Recommendation
     *
     * @param name string to check
     * @return true if name is a valid Name; false for the empty string
     * @throws NullPointerException if <code>name</code> is null
     */
    public static boolean isXML11ValidName(String name) {
        return matchesNameProduction(name, MASK_XML11_NAME_START, MASK_XML11_NAME);
    } // isXML11ValidName(String):boolean


    /*
     * from the namespace 1.1 rec
     * [4] NCName ::= NCNameStartChar NCNameChar*
     */
    /**
     * Check to see if a string is a valid NCName according to [4]
     * from the XML Namespaces 1.1 Recommendation
     *
     * @param ncName string to check
     * @return true if name is a valid NCName; false for the empty string
     * @throws NullPointerException if <code>ncName</code> is null
     */
    public static boolean isXML11ValidNCName(String ncName) {
        return matchesNameProduction(ncName, MASK_XML11_NCNAME_START, MASK_XML11_NCNAME);
    } // isXML11ValidNCName(String):boolean

    /*
     * [7] Nmtoken ::= (NameChar)+
     */
    /**
     * Check to see if a string is a valid Nmtoken according to [7]
     * in the XML 1.1 Recommendation
     *
     * @param nmtoken string to check
     * @return true if nmtoken is a valid Nmtoken; false for the empty string
     * @throws NullPointerException if <code>nmtoken</code> is null
     */
    public static boolean isXML11ValidNmtoken(String nmtoken) {
        return matchesNameProduction(nmtoken, MASK_XML11_NAME, MASK_XML11_NAME);
    } // isXML11ValidNmtoken(String):boolean

    /**
     * Simple check to determine if a qname is legal: either a single
     * NCName, or two NCNames (prefix and local part) separated by the
     * first colon in the string.
     *
     * @param str string to check
     * @return true if <code>str</code> is legal, false if it is illegal
     *         (including the empty string and a leading or trailing colon)
     * @throws NullPointerException if <code>str</code> is null
     */
    public static boolean isXML11ValidQName(String str) {

        final int colon = str.indexOf(':');

        // No colon: the whole string must be an NCName (empty string fails).
        if (colon < 0) {
            return isXML11ValidNCName(str);
        }

        // Empty prefix or empty local part.
        if (colon == 0 || colon == str.length() - 1) {
            return false;
        }

        // Any further colon ends up in the local part, which then fails
        // the NCName check because ':' is not an NCName character.
        final String prefix = str.substring(0, colon);
        final String localPart = str.substring(colon + 1);
        return isXML11ValidNCName(prefix) && isXML11ValidNCName(localPart);
    } // isXML11ValidQName(String):boolean

} // class XML11Char
432