View Javadoc

1   /*
2    * Copyright 2005 John G. Wilson
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  
18  package groovy.util;
19  
20  import groovy.util.slurpersupport.GPathResult;
21  import groovy.util.slurpersupport.Node;
22  import groovy.util.slurpersupport.NodeChild;
23  import groovy.xml.FactorySupport;
24  
25  import java.io.File;
26  import java.io.FileInputStream;
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.io.Reader;
30  import java.io.StringReader;
31  import java.net.URL;
32  import java.util.HashMap;
33  import java.util.Hashtable;
34  import java.util.Map;
35  import java.util.Stack;
36  
37  import javax.xml.parsers.ParserConfigurationException;
38  import javax.xml.parsers.SAXParser;
39  import javax.xml.parsers.SAXParserFactory;
40  
41  import org.xml.sax.Attributes;
42  import org.xml.sax.DTDHandler;
43  import org.xml.sax.EntityResolver;
44  import org.xml.sax.ErrorHandler;
45  import org.xml.sax.InputSource;
46  import org.xml.sax.SAXException;
47  import org.xml.sax.SAXNotRecognizedException;
48  import org.xml.sax.SAXNotSupportedException;
49  import org.xml.sax.XMLReader;
50  import org.xml.sax.helpers.DefaultHandler;
51  
52  /***
53   * @author John Wilson
54   *
55   */
56  
57  public class XmlSlurper extends DefaultHandler {
58    private final XMLReader reader;
59    private Node currentNode = null;
60    private final Stack stack = new Stack();
61    private final StringBuffer charBuffer = new StringBuffer();
62    private final Map namespaceTagHints = new Hashtable();
63    private boolean keepWhitespace = false;
64  
65    public XmlSlurper() throws ParserConfigurationException, SAXException {
66      this(false, true);
67    }
68    
69    public XmlSlurper(final boolean validating, final boolean namespaceAware) throws ParserConfigurationException, SAXException {
70      SAXParserFactory factory = FactorySupport.createSaxParserFactory();
71      factory.setNamespaceAware(namespaceAware);
72      factory.setValidating(validating);
73      this.reader = factory.newSAXParser().getXMLReader();
74    }
75    
76    public XmlSlurper(final XMLReader reader) {
77      this.reader = reader;
78    }
79    
80    public XmlSlurper(final SAXParser parser) throws SAXException {
81      this(parser.getXMLReader());
82    }
83    
84    /***
85     * @param keepWhitespace
86     * 
87     * If true then whitespace before elements is kept.
88     * The deafult is to discard the whitespace.
89     */
90    public void setKeepWhitespace(boolean keepWhitespace) {
91        this.keepWhitespace = keepWhitespace;
92    }
93    
94    /***
95     * @return The GPathResult instance created by consuming a stream of SAX events
96     * Note if one of the parse methods has been called then this returns null
97     * Note if this is called more than once all calls after the first will return null
98     *
99     */
100   public GPathResult getDocument() {
101     try {
102       return new NodeChild(this.currentNode, null, this.namespaceTagHints);
103     } finally {
104       this.currentNode = null;
105     }
106   }
107   
108   /***
109    * Parse the content of the specified input source into a GPathResult object
110    * 
111    * @param input
112    * @return An object which supports GPath expressions
113    * @throws IOException
114    * @throws SAXException
115    */
116   public GPathResult parse(final InputSource input) throws IOException, SAXException {
117     this.reader.setContentHandler(this);
118     this.reader.parse(input);
119     
120     return getDocument();
121     
122   }
123   
124   /***
125    * Parses the content of the given file as XML turning it into a GPathResult object
126    * 
127    * @param file
128    * @return An object which supports GPath expressions
129    * @throws IOException
130    * @throws SAXException
131    */
132   public GPathResult parse(final File file) throws IOException, SAXException {
133   final InputSource input = new InputSource(new FileInputStream(file));
134     
135     input.setSystemId("file://" + file.getAbsolutePath());
136     
137     return parse(input);
138     
139   }
140   
141   /***
142    * Parse the content of the specified input stream into an GPathResult Object.
143    * Note that using this method will not provide the parser with any URI
144    * for which to find DTDs etc
145    * 
146    * @param input
147    * @return An object which supports GPath expressions
148    * @throws IOException
149    * @throws SAXException
150    */
151   public GPathResult parse(final InputStream input) throws IOException, SAXException {
152     return parse(new InputSource(input));
153   }
154   
155   /***
156    * Parse the content of the specified reader into a GPathResult Object.
157    * Note that using this method will not provide the parser with any URI
158    * for which to find DTDs etc
159    * 
160    * @param in
161    * @return An object which supports GPath expressions
162    * @throws IOException
163    * @throws SAXException
164    */
165   public GPathResult parse(final Reader in) throws IOException, SAXException {
166     return parse(new InputSource(in));
167   }
168   
169   /***
170    * Parse the content of the specified URI into a GPathResult Object
171    * 
172    * @param uri
173    * @return An object which supports GPath expressions
174    * @throws IOException
175    * @throws SAXException
176    */
177   public GPathResult parse(final String uri) throws IOException, SAXException {
178     return parse(new InputSource(uri));
179   }
180   
181   /***
182    * A helper method to parse the given text as XML
183    * 
184    * @param text
185    * @return An object which supports GPath expressions
186    */
187   public GPathResult parseText(final String text) throws IOException, SAXException {
188     return parse(new StringReader(text));
189   }
190   
191   // Delegated XMLReader methods
192   //------------------------------------------------------------------------
193 
194   /* (non-Javadoc)
195    * @see org.xml.sax.XMLReader#getDTDHandler()
196    */
197   public DTDHandler getDTDHandler() {
198       return this.reader.getDTDHandler();
199   }
200 
201   /* (non-Javadoc)
202    * @see org.xml.sax.XMLReader#getEntityResolver()
203    */
204   public EntityResolver getEntityResolver() {
205       return this.reader.getEntityResolver();
206   }
207 
208   /* (non-Javadoc)
209    * @see org.xml.sax.XMLReader#getErrorHandler()
210    */
211   public ErrorHandler getErrorHandler() {
212       return this.reader.getErrorHandler();
213   }
214 
215   /* (non-Javadoc)
216    * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
217    */
218   public boolean getFeature(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
219       return this.reader.getFeature(uri);
220   }
221 
222   /* (non-Javadoc)
223    * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
224    */
225   public Object getProperty(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
226       return this.reader.getProperty(uri);
227   }
228 
229   /* (non-Javadoc)
230    * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
231    */
232   public void setDTDHandler(final DTDHandler dtdHandler) {
233       this.reader.setDTDHandler(dtdHandler);
234   }
235 
236   /* (non-Javadoc)
237    * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
238    */
239   public void setEntityResolver(final EntityResolver entityResolver) {
240       this.reader.setEntityResolver(entityResolver);
241   }
242 
243   /***
244    * Resolves entities against using the suppied URL as the base for relative URLs
245    * 
246    * @param base
247    * The URL used to resolve relative URLs
248    */
249   public void setEntityBaseUrl(final URL base) {
250       this.reader.setEntityResolver(new EntityResolver() {
251           public InputSource resolveEntity(final String publicId, final String systemId) throws IOException {
252               return new InputSource(new URL(base, systemId).openStream());
253           }
254       });
255   }
256 
257   /* (non-Javadoc)
258    * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
259    */
260   public void setErrorHandler(final ErrorHandler errorHandler) {
261       this.reader.setErrorHandler(errorHandler);
262   }
263 
264   /* (non-Javadoc)
265    * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
266    */
267   public void setFeature(final String uri, final boolean value) throws SAXNotRecognizedException, SAXNotSupportedException {
268       this.reader.setFeature(uri, value);
269   }
270 
271   /* (non-Javadoc)
272    * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
273    */
274   public void setProperty(final String uri, final Object value) throws SAXNotRecognizedException, SAXNotSupportedException {
275        this.reader.setProperty(uri, value);
276   }
277   
278   
279   // ContentHandler interface
280   //-------------------------------------------------------------------------                    
281   
282   /* (non-Javadoc)
283    * @see org.xml.sax.ContentHandler#startDocument()
284    */
285   public void startDocument() throws SAXException {
286     this.currentNode = null;
287     this.charBuffer.setLength(0);
288   }
289   
290   /* (non-Javadoc)
291    * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String)
292    */
293   public void startPrefixMapping(final String tag, final String uri) throws SAXException {
294     this.namespaceTagHints.put(tag, uri);
295   }
296 
297   /* (non-Javadoc)
298    * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
299    */
300   public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes atts) throws SAXException {
301     addCdata();
302     
303     final Map attributes = new HashMap();
304     final Map attributeNamespaces = new HashMap();
305     
306     for (int i = atts.getLength() - 1; i != -1; i--) {
307       if (atts.getURI(i).length() == 0) {
308         attributes.put(atts.getQName(i), atts.getValue(i));
309       } else {
310         attributes.put(atts.getLocalName(i), atts.getValue(i));
311         attributeNamespaces.put(atts.getLocalName(i), atts.getURI(i));
312       }
313       
314     }
315     
316     final Node newElement;
317     
318     if (namespaceURI.length() == 0){
319       newElement = new Node(this.currentNode, qName, attributes, attributeNamespaces, namespaceURI);
320     } else {
321       newElement = new Node(this.currentNode, localName, attributes, attributeNamespaces, namespaceURI);
322     }
323     
324     if (this.currentNode != null) {
325       this.currentNode.addChild(newElement);
326     }
327     
328     this.stack.push(this.currentNode);
329     this.currentNode = newElement;
330   }
331   
332   /* (non-Javadoc)
333    * @see org.xml.sax.ContentHandler#characters(char[], int, int)
334    */
335   public void characters(final char[] ch, final int start, final int length) throws SAXException {
336     this.charBuffer.append(ch, start, length);
337   }
338   
339   /* (non-Javadoc)
340    * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
341    */
342   public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
343     addCdata();
344     
345     final Object oldCurrentNode = this.stack.pop();
346     
347     if (oldCurrentNode != null) {
348       this.currentNode = (Node)oldCurrentNode;
349     }
350   }
351   
352   /* (non-Javadoc)
353    * @see org.xml.sax.ContentHandler#endDocument()
354    */
355   public void endDocument() throws SAXException {
356   }
357   
358   // Implementation methods
359   //-------------------------------------------------------------------------           
360   
361   /***
362    * 
363    */
364   private void addCdata() {
365     if (this.charBuffer.length() != 0) {
366       //
367       // This element is preceeded by CDATA if keepWhitespace is false (the default setting) and 
368       // it's not whitespace add it to the body
369       // Note that, according to the XML spec, we should preserve the CDATA if it's all whitespace
370       // but for the sort of work I'm doing ignoring the whitespace is preferable
371       //
372       final String cdata = this.charBuffer.toString();
373       
374       this.charBuffer.setLength(0);
375       if (this.keepWhitespace || cdata.trim().length() != 0) {
376         this.currentNode.addChild(cdata);
377       }
378     }   
379   }
380 }