001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements. See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership. The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the  "License");
007     * you may not use this file except in compliance with the License.
008     * You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.xml.serializer.utils;
020    
021    import java.util.Arrays;
022    
/**
 * THIS IS A COPY OF THE XERCES-2J CLASS org.apache.xerces.util.XML11Char
 * <p>
 * This class defines the basic properties of characters in XML 1.1. The data
 * in this class can be used to verify that a character is a valid
 * XML 1.1 character or if the character is a space, name start, or name
 * character.
 * <p>
 * The properties of every character in the Basic Multilingual Plane are kept
 * in an internal flag table indexed by character value; each
 * <code>MASK_XML11_*</code> constant selects one bit of a table entry.
 * The convenience methods apply the appropriate mask and, where the
 * property extends to supplementary characters (0x10000 and above, which
 * are not in the table), add the required range check. See for example
 * <code>isXML11Valid</code>.
 * <p>
 * All character predicates accept any <code>int</code>; values that are
 * negative or greater than 0x10FFFF never match a property.
 *
 * @author Glenn Marcy, IBM
 * @author Andy Clark, IBM
 * @author Arnaud  Le Hors, IBM
 * @author Neil Graham, IBM
 * @author Michael Glavassevich, IBM
 *
 * @version $Id: XML11Char.java 1225426 2011-12-29 04:13:08Z mrglavas $
 */
public class XML11Char {

    //
    // Constants
    //

    /** Character flags for XML 1.1, one entry per UTF-16 code unit. */
    private static final byte[] XML11CHARS = new byte [1 << 16];

    /** XML 1.1 Valid character mask. */
    public static final int MASK_XML11_VALID = 0x01;

    /** XML 1.1 Space character mask. */
    public static final int MASK_XML11_SPACE = 0x02;

    /** XML 1.1 Name start character mask. */
    public static final int MASK_XML11_NAME_START = 0x04;

    /** XML 1.1 Name character mask. */
    public static final int MASK_XML11_NAME = 0x08;

    /** XML 1.1 control character mask */
    public static final int MASK_XML11_CONTROL = 0x10;

    /** XML 1.1 content for external entities (valid - "special" chars - control chars) */
    public static final int MASK_XML11_CONTENT = 0x20;

    /** XML namespaces 1.1 NCNameStart */
    public static final int MASK_XML11_NCNAME_START = 0x40;

    /** XML namespaces 1.1 NCName */
    public static final int MASK_XML11_NCNAME = 0x80;

    /** XML 1.1 content for internal entities (valid - "special" chars) */
    public static final int MASK_XML11_CONTENT_INTERNAL = MASK_XML11_CONTROL | MASK_XML11_CONTENT;

    /** First supplementary code point; everything below it is covered by the flag table. */
    private static final int SUPPLEMENTARY_MIN = 0x10000;

    /** Largest code point accepted by the validity and content checks. */
    private static final int SUPPLEMENTARY_MAX = 0x10FFFF;

    /** Exclusive upper bound of the supplementary range accepted in names (planes 15 and 16 are excluded). */
    private static final int NAME_SUPPLEMENTARY_LIMIT = 0xF0000;

    //
    // Static initialization
    //

    static {

        // Initializing the Character Flag Array
        // Code generated by: XML11CharGenerator.
        //
        // Entries that are never assigned (0, the surrogate block
        // 55296-57343, 65534 and 65535) keep the default value 0,
        // i.e. they carry no flag at all.

        Arrays.fill(XML11CHARS, 1, 9, (byte) 17 ); // Fill 8 of value (byte) 17
        XML11CHARS[9] = 35;
        XML11CHARS[10] = 3;
        Arrays.fill(XML11CHARS, 11, 13, (byte) 17 ); // Fill 2 of value (byte) 17
        XML11CHARS[13] = 3;
        Arrays.fill(XML11CHARS, 14, 32, (byte) 17 ); // Fill 18 of value (byte) 17
        XML11CHARS[32] = 35;
        Arrays.fill(XML11CHARS, 33, 38, (byte) 33 ); // Fill 5 of value (byte) 33
        XML11CHARS[38] = 1;
        Arrays.fill(XML11CHARS, 39, 45, (byte) 33 ); // Fill 6 of value (byte) 33
        Arrays.fill(XML11CHARS, 45, 47, (byte) -87 ); // Fill 2 of value (byte) -87
        XML11CHARS[47] = 33;
        Arrays.fill(XML11CHARS, 48, 58, (byte) -87 ); // Fill 10 of value (byte) -87
        XML11CHARS[58] = 45;
        XML11CHARS[59] = 33;
        XML11CHARS[60] = 1;
        Arrays.fill(XML11CHARS, 61, 65, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 65, 91, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 91, 93, (byte) 33 ); // Fill 2 of value (byte) 33
        XML11CHARS[93] = 1;
        XML11CHARS[94] = 33;
        XML11CHARS[95] = -19;
        XML11CHARS[96] = 33;
        Arrays.fill(XML11CHARS, 97, 123, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 123, 127, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 127, 133, (byte) 17 ); // Fill 6 of value (byte) 17
        XML11CHARS[133] = 35;
        Arrays.fill(XML11CHARS, 134, 160, (byte) 17 ); // Fill 26 of value (byte) 17
        Arrays.fill(XML11CHARS, 160, 183, (byte) 33 ); // Fill 23 of value (byte) 33
        XML11CHARS[183] = -87;
        Arrays.fill(XML11CHARS, 184, 192, (byte) 33 ); // Fill 8 of value (byte) 33
        Arrays.fill(XML11CHARS, 192, 215, (byte) -19 ); // Fill 23 of value (byte) -19
        XML11CHARS[215] = 33;
        Arrays.fill(XML11CHARS, 216, 247, (byte) -19 ); // Fill 31 of value (byte) -19
        XML11CHARS[247] = 33;
        Arrays.fill(XML11CHARS, 248, 768, (byte) -19 ); // Fill 520 of value (byte) -19
        Arrays.fill(XML11CHARS, 768, 880, (byte) -87 ); // Fill 112 of value (byte) -87
        Arrays.fill(XML11CHARS, 880, 894, (byte) -19 ); // Fill 14 of value (byte) -19
        XML11CHARS[894] = 33;
        Arrays.fill(XML11CHARS, 895, 8192, (byte) -19 ); // Fill 7297 of value (byte) -19
        Arrays.fill(XML11CHARS, 8192, 8204, (byte) 33 ); // Fill 12 of value (byte) 33
        Arrays.fill(XML11CHARS, 8204, 8206, (byte) -19 ); // Fill 2 of value (byte) -19
        Arrays.fill(XML11CHARS, 8206, 8232, (byte) 33 ); // Fill 26 of value (byte) 33
        XML11CHARS[8232] = 35;
        Arrays.fill(XML11CHARS, 8233, 8255, (byte) 33 ); // Fill 22 of value (byte) 33
        Arrays.fill(XML11CHARS, 8255, 8257, (byte) -87 ); // Fill 2 of value (byte) -87
        Arrays.fill(XML11CHARS, 8257, 8304, (byte) 33 ); // Fill 47 of value (byte) 33
        Arrays.fill(XML11CHARS, 8304, 8592, (byte) -19 ); // Fill 288 of value (byte) -19
        Arrays.fill(XML11CHARS, 8592, 11264, (byte) 33 ); // Fill 2672 of value (byte) 33
        Arrays.fill(XML11CHARS, 11264, 12272, (byte) -19 ); // Fill 1008 of value (byte) -19
        Arrays.fill(XML11CHARS, 12272, 12289, (byte) 33 ); // Fill 17 of value (byte) 33
        Arrays.fill(XML11CHARS, 12289, 55296, (byte) -19 ); // Fill 43007 of value (byte) -19
        Arrays.fill(XML11CHARS, 57344, 63744, (byte) 33 ); // Fill 6400 of value (byte) 33
        Arrays.fill(XML11CHARS, 63744, 64976, (byte) -19 ); // Fill 1232 of value (byte) -19
        Arrays.fill(XML11CHARS, 64976, 65008, (byte) 33 ); // Fill 32 of value (byte) 33
        Arrays.fill(XML11CHARS, 65008, 65534, (byte) -19 ); // Fill 526 of value (byte) -19

    } // <clinit>()

    //
    // Private helpers
    //

    /**
     * Returns true if <code>c</code> lies inside the flag table and its
     * entry has at least one bit of <code>mask</code> set. Values outside
     * the table, including negative ones, simply yield false rather than
     * an out-of-bounds access.
     */
    private static boolean hasFlag(int c, int mask) {
        return 0 <= c && c < SUPPLEMENTARY_MIN && (XML11CHARS[c] & mask) != 0;
    } // hasFlag(int,int):boolean

    /** Returns true if <code>c</code> is in the range 0x10000 to 0x10FFFF. */
    private static boolean isSupplementary(int c) {
        return SUPPLEMENTARY_MIN <= c && c <= SUPPLEMENTARY_MAX;
    } // isSupplementary(int):boolean

    /**
     * Returns true if <code>c</code> carries the given name-related flag or
     * is a supplementary character in the range 0x10000 to 0xEFFFF.
     */
    private static boolean isNameLike(int c, int mask) {
        return hasFlag(c, mask)
            || (SUPPLEMENTARY_MIN <= c && c < NAME_SUPPLEMENTARY_LIMIT);
    } // isNameLike(int,int):boolean

    /**
     * Scans <code>s</code> code point by code point, requiring the first
     * code point to satisfy <code>firstMask</code> and every following one
     * to satisfy <code>restMask</code>.
     * <p>
     * <code>String.codePointAt</code> only combines well-formed surrogate
     * pairs; an unpaired surrogate is returned as is, carries no flag in
     * the table, and therefore makes the scan fail.
     *
     * @return false for the empty string or as soon as one code point
     *         does not match; true otherwise
     */
    private static boolean matchesNameProduction(String s, int firstMask, int restMask) {
        final int length = s.length();
        if (length == 0) {
            return false;
        }
        int mask = firstMask;
        int i = 0;
        while (i < length) {
            final int codePoint = s.codePointAt(i);
            if (!isNameLike(codePoint, mask)) {
                return false;
            }
            i += Character.charCount(codePoint);
            mask = restMask;
        }
        return true;
    } // matchesNameProduction(String,int,int):boolean

    //
    // Public static methods
    //

    /**
     * Returns true if the specified character is a space character
     * as amended in the XML 1.1 specification.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Space(int c) {
        return hasFlag(c, MASK_XML11_SPACE);
    } // isXML11Space(int):boolean

    /**
     * Returns true if the specified character is valid. This method
     * also accepts the supplementary character range from 0x10000 to
     * 0x10FFFF, which is not covered by the flag table.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Valid(int c) {
        return hasFlag(c, MASK_XML11_VALID) || isSupplementary(c);
    } // isXML11Valid(int):boolean

    /**
     * Returns true if the specified character is invalid.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Invalid(int c) {
        return !isXML11Valid(c);
    } // isXML11Invalid(int):boolean

    /**
     * Returns true if the specified character is valid and permitted outside
     * of a character reference.
     * That is, this method returns true for the same set as
     * isXML11Valid, except that it reports false for "control characters".
     *
     * @param c The character to check.
     */
    public static boolean isXML11ValidLiteral(int c) {
        if (0 <= c && c < SUPPLEMENTARY_MIN) {
            // Valid bit must be set and control bit must be clear.
            final int validOrControl = MASK_XML11_VALID | MASK_XML11_CONTROL;
            return (XML11CHARS[c] & validOrControl) == MASK_XML11_VALID;
        }
        return isSupplementary(c);
    } // isXML11ValidLiteral(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an external parsed entity.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Content(int c) {
        return hasFlag(c, MASK_XML11_CONTENT) || isSupplementary(c);
    } // isXML11Content(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an internal parsed entity.
     *
     * @param c The character to check.
     */
    public static boolean isXML11InternalEntityContent(int c) {
        return hasFlag(c, MASK_XML11_CONTENT_INTERNAL) || isSupplementary(c);
    } // isXML11InternalEntityContent(int):boolean

    /**
     * Returns true if the specified character is a valid name start
     * character as defined by production [4] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NameStart(int c) {
        return isNameLike(c, MASK_XML11_NAME_START);
    } // isXML11NameStart(int):boolean

    /**
     * Returns true if the specified character is a valid name
     * character as defined by production [4a] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Name(int c) {
        return isNameLike(c, MASK_XML11_NAME);
    } // isXML11Name(int):boolean

    /**
     * Returns true if the specified character is a valid NCName start
     * character as defined by production [4] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NCNameStart(int c) {
        return isNameLike(c, MASK_XML11_NCNAME_START);
    } // isXML11NCNameStart(int):boolean

    /**
     * Returns true if the specified character is a valid NCName
     * character as defined by production [5] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NCName(int c) {
        return isNameLike(c, MASK_XML11_NCNAME);
    } // isXML11NCName(int):boolean

    /**
     * Returns whether the given character is a valid
     * high surrogate for a name character. This includes
     * all high surrogates for characters [0x10000-0xEFFFF].
     * In other words everything excluding planes 15 and 16.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NameHighSurrogate(int c) {
        return (0xD800 <= c && c <= 0xDB7F);
    } // isXML11NameHighSurrogate(int):boolean

    /*
     * [5] Name ::= NameStartChar NameChar*
     */
    /**
     * Check to see if a string is a valid Name according to [5]
     * in the XML 1.1 Recommendation
     *
     * @param name string to check
     * @return true if name is a valid Name; false if it is empty, contains
     *         a character not allowed at its position, or contains an
     *         unpaired surrogate
     * @throws NullPointerException if name is null
     */
    public static boolean isXML11ValidName(String name) {
        return matchesNameProduction(name, MASK_XML11_NAME_START, MASK_XML11_NAME);
    } // isXML11ValidName(String):boolean


    /*
     * from the namespace 1.1 rec
     * [4] NCName ::= NCNameStartChar NCNameChar*
     */
    /**
     * Check to see if a string is a valid NCName according to [4]
     * from the XML Namespaces 1.1 Recommendation
     *
     * @param ncName string to check
     * @return true if ncName is a valid NCName; false if it is empty,
     *         contains a character not allowed at its position, or
     *         contains an unpaired surrogate
     * @throws NullPointerException if ncName is null
     */
    public static boolean isXML11ValidNCName(String ncName) {
        return matchesNameProduction(ncName, MASK_XML11_NCNAME_START, MASK_XML11_NCNAME);
    } // isXML11ValidNCName(String):boolean

    /*
     * [7] Nmtoken ::= (NameChar)+
     */
    /**
     * Check to see if a string is a valid Nmtoken according to [7]
     * in the XML 1.1 Recommendation
     *
     * @param nmtoken string to check
     * @return true if nmtoken is a valid Nmtoken; false if it is empty,
     *         contains a non-name character, or contains an unpaired
     *         surrogate
     * @throws NullPointerException if nmtoken is null
     */
    public static boolean isXML11ValidNmtoken(String nmtoken) {
        // Every position, including the first, only has to be a NameChar.
        return matchesNameProduction(nmtoken, MASK_XML11_NAME, MASK_XML11_NAME);
    } // isXML11ValidNmtoken(String):boolean

} // class XML11Char
414