CleanXmlHelper.java
| Index Score | ||
|---|---|---|
![]() |
![]() |
org.pentaho.core.util |
![]() |
![]() |
Pentaho |
View: Reasons, Metrics, Source Code
These are the metrics that contribute to the Enerjy Score for this file, ranked by impact. So the metrics listed at the top influence the score to a greater extent than the metrics listed at the bottom.
/*
* Copyright 2007 Pentaho Corporation. All rights reserved.
* This software was developed by Pentaho Corporation and is provided under the terms
* of the Mozilla Public License, Version 1.1, or any later version. You may not use
* this file except in compliance with the license. If you need a copy of the license,
* please go to http://www.mozilla.org/MPL/MPL-1.1.txt. The Original Code is the Pentaho
* BI Platform. The Initial Developer is Pentaho Corporation.
*
* Software distributed under the Mozilla Public License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. Please refer to
* the license for the specific language governing your rights and limitations.
*/
package org.pentaho.core.util;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.URIResolver;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.pentaho.messages.Messages;
import org.xml.sax.EntityResolver;
// TODO sbarkdull, externalize strings, comment methods
/**
* A set of static methods to help with: * the construction of XML DOM
* Documents (org.dom4j.Document) from files, streams, and Strings * the
* creation of XML DOM Documents as the result of an XSLT transform * the
* persisting of XML DOM documents to the file system or a <code>Writer</code>
* * the encoding of a String of XML text
*
* Design notes: This class should never have any dependencies (i.e. imports) on
* anything in org.pentaho or com.pentaho or their descendant packages. In
* general, methods in the class should not attempt to handle exceptions, but
* should let the exceptions propagate to the caller to be handled there. Please
* do not use european-reuse in this class. One of the primary design goals for
* this class was to construct it in a way that it could be used without change
* outside of the Pentaho platform. Related XML-helper type code that is
* dependent on the platform should be moved "up" to XmlHelper.
*/
public class CleanXmlHelper {
// Commons-logging logger shared by the static helpers in this class.
private static final Log logger = LogFactory.getLog(CleanXmlHelper.class);
/*
* This regular expression pattern should match the value of the encoding
* pseudo-attribute in the XML declaration of an xml document, e.g.
* <?xml version="1.0" encoding="UTF-8" ?>
*
* Capture group 1 is the opening quote character (single or double); the
* back-reference \1 requires the same character as the closing quote.
* Capture group 2 is the encoding name itself. Pattern.DOTALL lets '.' span
* line terminators, so the text that follows the declaration does not prevent
* a whole-input match.
*/
private static final Pattern RE_ENCODING = Pattern.compile(
"<\\?xml.*encoding=('|\")([^'\"]*)\\1.*\\?>.*", Pattern.DOTALL); //$NON-NLS-1$
/**
* Should never be called: every method of this class is static, so there is
* no reason to create an instance. The constructor is <code>protected</code>
* rather than <code>private</code>, so the class can still be subclassed.
*/
protected CleanXmlHelper() {
}
/**
 * Create a <code>Document</code> from <code>strXml</code>.
 *
 * @param strXml
 *          String containing the XML that will be used to create the Document
 * @param encoding
 *          String specifying the character encoding used to turn
 *          <code>strXml</code> into bytes for the parser. The encoding of the
 *          xml String can be discovered by calling
 *          CleanXmlHelper.getEncoding(). May be null, in which case the
 *          platform default encoding is used.
 * @param resolver EntityResolver an instance of an EntityResolver that will resolve
 *          any external URIs. See the docs on EntityResolver. null is an acceptable value.
 * @return <code>Document</code> initialized with the xml in
 *         <code>strXml</code>; never null.
 * @throws DocumentException
 *           if the XML cannot be parsed, or if <code>encoding</code> names a
 *           character encoding this JVM does not support (the original
 *           <code>UnsupportedEncodingException</code> is attached as the cause)
 */
public static Document getDocFromString(String strXml, String encoding, EntityResolver resolver)
    throws DocumentException {
  byte[] bytes;
  try {
    if (null != encoding) {
      bytes = strXml.getBytes(encoding);
    } else {
      // doh, we don't know the encoding, cross your fingers and hope for the best
      bytes = strXml.getBytes();
    }
  } catch (UnsupportedEncodingException e) {
    // Propagate instead of logging and returning null: a null Document only
    // moves the failure to a NullPointerException in the caller.
    throw new DocumentException(e.getMessage(), e);
  }
  InputStream inStrm = new ByteArrayInputStream(bytes);
  try {
    return getDocFromStream(inStrm, encoding, resolver);
  } finally {
    closeInputStream(inStrm);
  }
}
/**
 * Create a <code>Document</code> from <code>strXml</code>, using the character
 * encoding that <code>getEncoding(String)</code> finds in the XML text itself.
 *
 * @param strXml
 *          String containing the XML that will be used to create the Document
 * @param resolver EntityResolver an instance of an EntityResolver that will resolve
 *          any external URIs. See the docs on EntityResolver. null is an acceptable value.
 * @return <code>Document</code> initialized with the xml in
 *         <code>strXml</code>.
 * @throws DocumentException
 */
public static Document getDocFromString(String strXml, EntityResolver resolver) throws DocumentException {
  return getDocFromString(strXml, getEncoding(strXml), resolver);
}
/**
 * Create a <code>Document</code> from the contents of the file found at
 * <code>path</code>.
 *
 * @param path
 *          String containing the path to the file containing XML that will be
 *          used to create the Document.
 * @param resolver EntityResolver an instance of an EntityResolver that will resolve
 *          any external URIs. See the docs on EntityResolver. null is an acceptable value.
 * @return <code>Document</code> initialized with the xml read from the file.
 * @throws DocumentException
 *           if the document isn't valid
 * @throws IOException
 */
public static Document getDocFromFile(String path, EntityResolver resolver) throws DocumentException, IOException {
  return getDocFromFile(new File(path), resolver);
}
/**
 * Create a <code>Document</code> from the contents of a file. The character
 * encoding is looked up with <code>getEncoding(File)</code> before the file is
 * opened for parsing.
 *
 * @param file
 *          File containing the XML that will be used to create the Document.
 * @param resolver EntityResolver an instance of an EntityResolver that will resolve
 *          any external URIs. See the docs on EntityResolver. null is an acceptable value.
 * @return <code>Document</code> initialized with the xml read from
 *         <code>file</code>.
 * @throws DocumentException
 *           if the document isn't valid
 * @throws IOException
 *           if the file doesn't exist
 */
public static Document getDocFromFile(File file, EntityResolver resolver) throws DocumentException, IOException {
  final String fileEncoding = getEncoding(file);
  final InputStream xmlIn = new FileInputStream(file);
  try {
    return getDocFromStream(xmlIn, fileEncoding, resolver);
  } finally {
    // closes quietly; a failure to close is only logged
    closeInputStream(xmlIn);
  }
}
/**
 * Create a <code>Document</code> from the contents of an input stream,
 * where the input stream contains valid XML.
 *
 * NOTE: callers that do not already know the encoding should prefer
 * getDocFromStream(InputStream inStream), which examines the XML for the
 * encoding specified in its declaration. That relieves the caller of the
 * burden of discovering the encoding, but also assumes that the encoding in
 * the XML is specified properly.
 *
 * @param inStream InputStream
 *          input stream to read the XML from
 * @param encoding String the character encoding of the bytes in
 *          <code>inStream</code>, for instance UTF-8. Can be null, in which
 *          case no encoding is set on the reader.
 * @param resolver EntityResolver an instance of an EntityResolver that will resolve
 *          any external URIs. See the docs on EntityResolver. null is an acceptable value.
 * @return <code>Document</code> initialized with the xml read from
 *         <code>inStream</code>.
 * @throws DocumentException
 *           if the document isn't valid
 */
private static Document getDocFromStream(InputStream inStream, String encoding, EntityResolver resolver) throws DocumentException {
  final SAXReader saxReader = new SAXReader();
  if (encoding != null) {
    saxReader.setEncoding(encoding);
  }
  if (resolver != null) {
    saxReader.setEntityResolver(resolver);
  }
  return saxReader.read(inStream);
}
/**
 * Create a <code>Document</code> from the contents of an input stream,
 * where the input stream contains valid XML. The character encoding is taken
 * from the XML declaration at the start of the stream, if there is one.
 *
 * Reading the declaration consumes the first bytes of the stream. Streams
 * that do not support mark/reset (a FileInputStream, for example) are
 * therefore wrapped in a BufferedInputStream first, so that the parser still
 * sees the document from its first byte.
 *
 * @param inStream
 *          input stream to read the XML from
 * @param resolver EntityResolver an instance of an EntityResolver that will resolve
 *          any external URIs. null is an acceptable value.
 * @return <code>Document</code> initialized with the xml read from
 *         <code>inStream</code>.
 * @throws DocumentException
 *           if the document isn't valid
 * @throws IOException
 *           if the start of the stream cannot be read
 */
public static Document getDocFromStream(InputStream inStream, EntityResolver resolver) throws DocumentException, IOException {
  // getEncoding() can only rewind streams that support mark/reset; without
  // this wrapper the parser would start up to 256 bytes into the document.
  InputStream rewindable = inStream.markSupported() ? inStream : new java.io.BufferedInputStream(inStream);
  String encoding = getEncoding(rewindable);
  return getDocFromStream(rewindable, encoding, resolver);
}
/**
 * Create a <code>Document</code> from the contents of an input stream,
 * where the input stream contains valid XML. No EntityResolver is used.
 *
 * @param inStream
 *          input stream to read the XML from
 * @return <code>Document</code> initialized with the xml read from
 *         <code>inStream</code>.
 * @throws DocumentException
 * @throws IOException
 */
public static Document getDocFromStream(InputStream inStream) throws DocumentException, IOException {
  final EntityResolver noResolver = null;
  return getDocFromStream(inStream, noResolver);
}
/**
 * Read up to the first 256 bytes of <code>inStream</code> and return them as
 * text, so that the XML declaration can be searched for an encoding.
 *
 * The bytes are decoded as ISO-8859-1, which maps every byte to exactly one
 * character and never fails. That is sufficient for declarations written in
 * an ASCII-compatible encoding; the result is only meant to be searched, not
 * displayed. Only the bytes actually read are returned, so a short stream
 * yields a short String without padding.
 *
 * WARNING: if the <code>inStream</code> instance does not support mark/reset,
 * when this method returns, subsequent reads on <code>inStream</code> will be
 * up to 256 bytes into the stream. This may not be the expected behavior.
 * FileInputStreams are an example of an InputStream that does not support
 * mark/reset. InputStreams that do support mark/reset will be reset to the
 * position they had when this method was called.
 *
 * @param inStream
 *          stream positioned at the start of an XML document
 * @return the leading bytes of the stream decoded as ISO-8859-1; empty if the
 *         stream is already at end of file
 * @throws IOException
 *           if reading or resetting the stream fails
 */
private static String readEncodingProcessingInstruction( InputStream inStream ) throws IOException
{
  final int BUFF_SZ = 256;
  final boolean canRewind = inStream.markSupported();
  if (canRewind) {
    inStream.mark(BUFF_SZ + 1); // BUFF_SZ+1 forces mark to NOT be forgotten
  }
  byte[] buf = new byte[BUFF_SZ];
  int totalBytesRead = 0;
  // a single read() may return fewer bytes than requested, so keep reading
  // until the buffer is full or the stream ends
  while (totalBytesRead < BUFF_SZ) {
    int bytesRead = inStream.read(buf, totalBytesRead, BUFF_SZ - totalBytesRead);
    if (bytesRead == -1) {
      break;
    }
    totalBytesRead += bytesRead;
  }
  if (canRewind) {
    inStream.reset();
  }
  return new String(buf, 0, totalBytesRead, "ISO-8859-1"); //$NON-NLS-1$
}
/**
 * Use the transform specified by xslSrc and transform the document specified
 * by docSrc, and return the resulting document.
 *
 * @param xslSrc
 *          StreamSource containing the xsl transform
 * @param docSrc
 *          StreamSource containing the document to be transformed
 * @param params
 *          Map of parameters to set on the transform, keyed by parameter name
 *          (String). Values are handed to the Transformer unchanged, so they
 *          need not be Strings; entries with a null value are skipped. May be
 *          null.
 * @param resolver
 *          URIResolver instance installed on the TransformerFactory to
 *          resolve URIs referenced by the transform. May be null.
 *
 * @return StringBuffer containing the XML results of the transform
 * @throws TransformerConfigurationException
 *           if the TransformerFactory fails to create a Transformer.
 * @throws TransformerException
 *           if actual transform fails.
 */
protected static final StringBuffer transformXml(StreamSource xslSrc, StreamSource docSrc, Map params,
    URIResolver resolver) throws TransformerConfigurationException, TransformerException {
  TransformerFactory tf = TransformerFactory.newInstance();
  if (null != resolver) {
    tf.setURIResolver(resolver);
  }
  // TODO need to look into compiling the XSLs...
  Transformer t = tf.newTransformer(xslSrc); // can throw TransformerConfigurationException
  if (params != null) {
    // Iterate over the entries to avoid a second lookup per key. The value is
    // passed through as an Object because setParameter accepts any Object.
    for (Iterator it = params.entrySet().iterator(); it.hasNext();) {
      Map.Entry entry = (Map.Entry) it.next();
      Object val = entry.getValue();
      if (val != null) {
        t.setParameter((String) entry.getKey(), val);
      }
    }
  }
  // Start the transformation
  StringWriter writer = new StringWriter();
  t.transform(docSrc, new StreamResult(writer)); // can throw TransformerException
  return writer.getBuffer();
}
/**
 * Use the transform read from xslInStream to transform the document read
 * from docInStrm, and return the resulting document. Both streams are wrapped
 * in a StreamSource and handed to the StreamSource-based overload.
 *
 * @param xslInStream
 *          InputStream containing the xsl transform
 * @param docInStrm
 *          InputStream containing the document to be transformed
 * @param params
 *          Map of properties to set on the transform
 * @param resolver
 *          URIResolver instance passed on to the transform
 *
 * @return StringBuffer containing the XML results of the transform
 * @throws TransformerConfigurationException
 *           if the TransformerFactory fails to create a Transformer.
 * @throws TransformerException
 *           if actual transform fails.
 */
public static final StringBuffer transformXml(InputStream xslInStream, InputStream docInStrm, Map params,
    URIResolver resolver) throws TransformerConfigurationException, TransformerException {
  return transformXml(new StreamSource(xslInStream), new StreamSource(docInStrm), params, resolver);
}
/**
 * Serialize a W3C Document to text by running it through a Transformer that
 * was created without a stylesheet.
 *
 * Note: if you are working with a dom4j Document, you can use its asXML()
 * method instead.
 *
 * @param doc
 *          org.w3c.dom.Document to be converted to text.
 * @return StringBuffer holding the XML text of the document.
 *
 * @throws TransformerConfigurationException
 *           If unable to get an instance of a Transformer
 * @throws TransformerException
 *           If the attempt to transform the document fails.
 */
public static final StringBuffer docToString(org.w3c.dom.Document doc) throws TransformerConfigurationException,
    TransformerException {
  final StringWriter xmlOut = new StringWriter();
  final Transformer serializer = TransformerFactory.newInstance().newTransformer(); // can throw TransformerConfigurationException
  serializer.transform(new DOMSource(doc), new StreamResult(xmlOut)); // can throw TransformerException
  return xmlOut.getBuffer();
}
/**
 * Find the character encoding specification in the xml String. If it exists,
 * return the character encoding. Otherwise, return null.
 *
 * @param xml
 *          String containing the xml. May be null, in which case null is
 *          returned.
 * @return String containing the character encoding in the xml declaration
 *         if it exists, else null.
 */
public static String getEncoding(String xml) {
  if (xml == null) {
    // nothing to inspect; treat it like a document without a declaration
    return null;
  }
  Matcher m = RE_ENCODING.matcher(xml);
  // Group 2 of RE_ENCODING is the encoding name. The group count is a
  // property of the pattern, so it does not need to be checked per call.
  return m.matches() ? m.group(2) : null;
}
// Number of characters read from the start of a file when looking for the
// XML declaration.
private static final int BUFF_SIZE = 512;
/**
 * Find the character encoding specified in the XML declaration at the start
 * of a file. Only the first <code>BUFF_SIZE</code> characters of the file are
 * examined, and they are decoded with the platform default encoding.
 *
 * @param f
 *          File containing the xml
 * @return String containing the character encoding named in the xml
 *         declaration if there is one, else null.
 * @throws IOException
 *           if the file cannot be opened or read
 */
public static String getEncoding(File f) throws IOException {
  char[] cbuf = new char[BUFF_SIZE];
  int charsRead = 0;
  Reader rdr = null;
  try {
    rdr = new FileReader(f);
    // a single read() may return fewer characters than requested
    while (charsRead < BUFF_SIZE) {
      int n = rdr.read(cbuf, charsRead, BUFF_SIZE - charsRead);
      if (n == -1) {
        break;
      }
      charsRead += n;
    }
  } finally {
    // rdr is still null when the file could not be opened; closing it
    // unconditionally would replace the real exception with a
    // NullPointerException
    if (rdr != null) {
      rdr.close();
    }
  }
  // use only the characters actually read, not the unused tail of the buffer
  return getEncoding(String.valueOf(cbuf, 0, charsRead));
}
/**
 * Find the character encoding specified in the XML declaration at the start
 * of a stream. The start of the stream is read with
 * readEncodingProcessingInstruction(), so a stream that does not support
 * mark/reset is left positioned after the bytes that were read.
 *
 * @param inStream
 *          InputStream positioned at the start of the xml
 * @return String containing the character encoding named in the xml
 *         declaration if there is one, else null.
 * @throws IOException
 */
public static String getEncoding( InputStream inStream ) throws IOException {
  return getEncoding( readEncodingProcessingInstruction( inStream ) );
}
// TODO sbarkdull, this code is duplicated in LocaleHelper
/**
 * Convert any character in the XML input (<code>rawValue</code>) whose
 * code point is greater than or equal to 0x80 to its hexadecimal Numeric
 * Character Reference. For a description of Numeric Character References
 * see: http://www.w3.org/TR/html4/charset.html#h-5.3.1
 *
 * The input is processed one Unicode code point at a time, so a character
 * outside the Basic Multilingual Plane (stored as a surrogate pair in a Java
 * String) becomes a single reference such as <code>&amp;#x1f600;</code>
 * rather than two references to its surrogate halves.
 *
 * @param rawValue
 *          String containing the XML to be encoded. Must not be null.
 * @return String containing the encoded XML
 */
public static String getXmlEncodedString(String rawValue) {
  StringBuilder value = new StringBuilder(rawValue.length());
  int n = 0;
  while (n < rawValue.length()) {
    int codePoint = rawValue.codePointAt(n);
    if (codePoint >= 0x80) {
      value.append("&#x"); //$NON-NLS-1$
      value.append(Integer.toString(codePoint, 0x10));
      value.append(";"); //$NON-NLS-1$
    } else {
      value.append((char) codePoint);
    }
    // advance by 2 chars for a surrogate pair, otherwise by 1
    n += Character.charCount(codePoint);
  }
  return value.toString();
}
/**
 * Write an XML document to the file found at <code>filePath</code> using the
 * specified character encoding.
 *
 * @param doc
 *          Document to be written
 * @param filePath
 *          path identifying the File that will be the output of the Document
 * @param encoding
 *          String specifying the character encoding. Can be null, in which
 *          case the default encoding will be used. See
 *          http://java.sun.com/j2se/1.5.0/docs/api/java/io/OutputStreamWriter.html
 * @throws IOException
 *           if unable to obtain a writer on the specified file
 */
public static void saveDomToFile(Document doc, String filePath, String encoding) throws IOException {
  saveDomToFile(doc, new File(filePath), encoding);
}
/**
 * Write an XML document to a file using the specified character encoding.
 * The file is flushed and closed before this method returns, whether or not
 * the write succeeds.
 *
 * @param doc
 *          Document to be written
 * @param file
 *          File that will be the output of the Document
 * @param encoding
 *          String specifying the character encoding. Can be null, in which
 *          case the default encoding will be used. See
 *          http://java.sun.com/j2se/1.5.0/docs/api/java/io/OutputStreamWriter.html
 * @throws IOException
 *           if the file cannot be opened or written, or if
 *           <code>encoding</code> is not a supported character encoding
 */
public static void saveDomToFile(Document doc, File file, String encoding) throws IOException {
  FileOutputStream fOut = new FileOutputStream(file);
  try {
    Writer fWriter;
    if (null != encoding) {
      fWriter = new OutputStreamWriter(fOut, encoding);
    } else {
      fWriter = new OutputStreamWriter(fOut);
    }
    saveDomToWriter(doc, fWriter);
    // OutputStreamWriter buffers encoded bytes; without a flush the tail of
    // the document (or all of it) may never reach the file
    fWriter.flush();
  } finally {
    // closing the stream releases the file handle even when the encoding is
    // unsupported or the write fails
    fOut.close();
  }
}
/**
 * Write the XML text of a document to a <code>Writer</code>. The writer is
 * neither flushed nor closed here.
 *
 * @param doc
 *          Document to be written
 * @param writer
 *          Writer that receives the text returned by <code>doc.asXML()</code>
 * @throws IOException
 *           if the write fails
 */
public static void saveDomToWriter(Document doc, Writer writer) throws IOException {
  final String xmlText = doc.asXML();
  writer.write(xmlText);
}
/**
 * Build the text of an XML declaration.
 *
 * @param version
 *          value for the version pseudo-attribute (usually 1.0)
 * @param encoding
 *          value for the encoding pseudo-attribute (for instance, UTF-8)
 * @return String of the form
 *         <code>&lt;?xml version="VERSION" encoding="ENCODING" ?&gt;</code>
 */
public static String createXmlProcessingInstruction( String version, String encoding )
{
  StringBuilder declaration = new StringBuilder();
  declaration.append("<?xml version=\""); //$NON-NLS-1$
  declaration.append(version);
  declaration.append("\" encoding=\""); //$NON-NLS-1$
  declaration.append(encoding);
  declaration.append("\" ?>"); //$NON-NLS-1$
  return declaration.toString();
}
// TODO sbarkdull, move to junit test class
/**
 * Ad-hoc smoke and timing test for this class; it is not part of the API.
 * It parses a small in-memory document, checks that
 * readEncodingProcessingInstruction() is repeatable on a resettable stream,
 * runs getEncoding() over a list of sample declarations, and finally times
 * getEncoding() against a plain character scan of a document loaded from a
 * hard-coded developer path. When that file cannot be loaded the timing part
 * is skipped.
 *
 * @param args
 *          ignored
 */
public static void main(String[] args) {
  String strXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root><b>first</b><b>second</b></root>"; //$NON-NLS-1$
  try {
    Document d = CleanXmlHelper.getDocFromString(strXml, null);
    String enc = d.getXMLEncoding();
    logger.debug( "encoding: " + enc ); //$NON-NLS-1$
  } catch (DocumentException e3) {
    logger.error(null, e3);
  }
  String defaultEncoding = (new OutputStreamWriter(new ByteArrayOutputStream())).getEncoding();
  Charset cs = Charset.defaultCharset();
  logger.info( "default Char set: " + cs.name() + " " + defaultEncoding ); //$NON-NLS-1$//$NON-NLS-2$
  ByteArrayInputStream s = new ByteArrayInputStream(strXml.getBytes() );
  try {
    // must be repeatable
    for ( int ii=0; ii<5; ++ii )
    {
      String pi = CleanXmlHelper.readEncodingProcessingInstruction( s );
      String encoding = CleanXmlHelper.getEncoding( pi );
      logger.debug( "encoding: " + encoding ); //$NON-NLS-1$
    }
    s.close();
  } catch (IOException e2) {
    logger.error(null, e2);
  }
  try {
    Document doc = CleanXmlHelper.getDocFromString( strXml, null );
    Node n = doc.selectSingleNode( "/root/b[text()='first']" ); //$NON-NLS-1$
    logger.debug( n.getText() );
  } catch (DocumentException e) {
    logger.error(null, e);
    System.exit(1);
  }
  String[] xmls = { "", //$NON-NLS-1$
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<pentaho-system></pentaho-system>", //$NON-NLS-1$
      "<?xml version=\"1.0\" encoding=\"windows-1252\"?>\n<root></root>", //$NON-NLS-1$
      "<?xml encoding=\"UTF-8\" version=\"1.0\"?><root></root>", //$NON-NLS-1$
      "<?xml encoding=\"UTF-8\" version='1.0'?><root></root>", //$NON-NLS-1$
      "<?xml encoding='UTF-8' version=\"1.0\"?><root></root>", //$NON-NLS-1$
      "<?xml encoding='UTF-8' version='1.0'?><root></root>", //$NON-NLS-1$
      "<?xml encoding='UTF-8\" version='1.0'?><root></root>", //$NON-NLS-1$
      "<?xml version=\"1.0\"?><root></root>", //$NON-NLS-1$
      "bart simpson was here", //$NON-NLS-1$
      "<root>encoding=bad</root>" //$NON-NLS-1$
  };
  for (int ii = 0; ii < xmls.length; ++ii) {
    String enc = getEncoding(xmls[ii]);
    // the message used to start with unexpanded "{0}"/"{1}" placeholders
    logger.trace("xml: " + xmls[ii] + " enc: " + enc); //$NON-NLS-1$ //$NON-NLS-2$
  }
  // performance test
  final int numTries = 10000;
  // big file
  // String nm = "C:\\projects\\pentaho\\pentaho-reportwizard\\samples\\data\\ClassicCars.xml";
  // small file
  // String nm = "C:\\projects\\pentaho\\my-solutions\\index.xml";
  String nm = "C:\\projects\\pentaho1.6\\pentaho-solutions\\system\\pentaho.xml";//$NON-NLS-1$
  String xml = null;
  try {
    xml = getDocFromFile(nm, null).asXML();
  } catch (DocumentException de) {
    logger.error(null, de);
  } catch (IOException ioe) {
    logger.error(null, ioe);
  }
  if (xml == null) {
    // the sample file only exists on the original developer's machine;
    // without it the loops below would fail with a NullPointerException
    logger.warn("unable to load " + nm + ", skipping performance test"); //$NON-NLS-1$ //$NON-NLS-2$
    return;
  }
  // let's run a test to see if getEncoding is reasonably fast
  long start0 = System.currentTimeMillis();
  for (int ii = 0; ii < numTries; ++ii) {
    getEncoding(xml);
  }
  long end0 = System.currentTimeMillis();
  logger.info("time: " + (end0 - start0)); //$NON-NLS-1$
  // baseline: touch every character of the document the same number of times
  long start1 = System.currentTimeMillis();
  for (int ii = 0; ii < numTries; ++ii) {
    for (int jj = 0; jj < xml.length(); ++jj) {
      xml.charAt(jj);
    }
  }
  long end1 = System.currentTimeMillis();
  logger.info("time: " + (end1 - start1)); //$NON-NLS-1$
  // end reasonably fast test
}
/**
 * Close an input stream without letting a failure escape: an IOException
 * raised by <code>close()</code> is logged as a warning and then dropped.
 * Passing null is allowed and does nothing. Helps keep calling code
 * uncluttered.
 *
 * @param strm
 *          InputStream to be closed, may be null
 */
protected static void closeInputStream(InputStream strm) {
  if (strm == null) {
    return;
  }
  try {
    strm.close();
  } catch (IOException closeFailure) {
    logger.warn(Messages.getString("CleanXmlHelper.WARN_INPUT_STREAM_NOT_CLOSED"), closeFailure); //$NON-NLS-1$
  }
}
}
The table below shows all metrics for CleanXmlHelper.java.




