001/* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.xml; 016 017import com.google.common.annotations.Beta; 018import com.google.common.annotations.GwtCompatible; 019import com.google.common.escape.Escaper; 020import com.google.common.escape.Escapers; 021 022/** 023 * {@code Escaper} instances suitable for strings to be included in XML attribute values and 024 * elements' text contents. When possible, avoid manual escaping by using templating systems and 025 * high-level APIs that provide autoescaping. For example, consider <a 026 * href="http://www.xom.nu/">XOM</a> or <a href="http://www.jdom.org/">JDOM</a>. 027 * 028 * <p><b>Note:</b> Currently the escapers provided by this class do not escape any characters 029 * outside the ASCII character range. Unlike HTML escaping the XML escapers will not escape 030 * non-ASCII characters to their numeric entity replacements. These XML escapers provide the minimal 031 * level of escaping to ensure that the output can be safely included in a Unicode XML document. 032 * 033 * <p>For details on the behavior of the escapers in this class, see sections <a 034 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and <a 035 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification. 036 * 037 * @author Alex Matevossian 038 * @author David Beaumont 039 * @since 15.0 040 */ 041@Beta 042@GwtCompatible 043@ElementTypesAreNonnullByDefault 044public class XmlEscapers { 045 private XmlEscapers() {} 046 047 private static final char MIN_ASCII_CONTROL_CHAR = 0x00; 048 private static final char MAX_ASCII_CONTROL_CHAR = 0x1F; 049 050 // For each xxxEscaper() method, please add links to external reference pages 051 // that are considered authoritative for the behavior of that escaper. 052 053 /** 054 * Returns an {@link Escaper} instance that escapes special characters in a string so it can 055 * safely be included in an XML document as element content. See section <a 056 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification. 057 * 058 * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not safe</b> to use this 059 * escaper to escape attribute values. Use {@link #xmlContentEscaper} if the output can appear in 060 * element content or {@link #xmlAttributeEscaper} in attribute values. 061 * 062 * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the 063 * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more 064 * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of 065 * the XML specification. 066 * 067 * <p>This escaper does not escape non-ASCII characters to their numeric character references 068 * (NCR). Any non-ASCII characters appearing in the input will be preserved in the output. 069 * Specifically "\r" (carriage return) is preserved in the output, which may result in it being 070 * silently converted to "\n" when the XML is parsed. 071 * 072 * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode 073 * validation on its input. 074 */ 075 public static Escaper xmlContentEscaper() { 076 return XML_CONTENT_ESCAPER; 077 } 078 079 /** 080 * Returns an {@link Escaper} instance that escapes special characters in a string so it can 081 * safely be included in XML document as an attribute value. See section <a 082 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a> of the XML 083 * specification. 084 * 085 * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the 086 * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more 087 * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of 088 * the XML specification. 089 * 090 * <p>This escaper does not escape non-ASCII characters to their numeric character references 091 * (NCR). However, horizontal tab {@code '\t'}, line feed {@code '\n'} and carriage return {@code 092 * '\r'} are escaped to a corresponding NCR {@code "	"}, {@code "
"}, and {@code "
"} 093 * respectively. Any other non-ASCII characters appearing in the input will be preserved in the 094 * output. 095 * 096 * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode 097 * validation on its input. 098 */ 099 public static Escaper xmlAttributeEscaper() { 100 return XML_ATTRIBUTE_ESCAPER; 101 } 102 103 private static final Escaper XML_ESCAPER; 104 private static final Escaper XML_CONTENT_ESCAPER; 105 private static final Escaper XML_ATTRIBUTE_ESCAPER; 106 107 static { 108 Escapers.Builder builder = Escapers.builder(); 109 // The char values \uFFFE and \uFFFF are explicitly not allowed in XML 110 // (Unicode code points above \uFFFF are represented via surrogate pairs 111 // which means they are treated as pairs of safe characters). 112 builder.setSafeRange(Character.MIN_VALUE, '\uFFFD'); 113 // Unsafe characters are replaced with the Unicode replacement character. 114 builder.setUnsafeReplacement("\uFFFD"); 115 116 /* 117 * Except for \n, \t, and \r, all ASCII control characters are replaced with the Unicode 118 * replacement character. 119 * 120 * Implementation note: An alternative to the following would be to make a map that simply 121 * replaces the allowed ASCII whitespace characters with themselves and to set the minimum safe 122 * character to 0x20. However this would slow down the escaping of simple strings that contain 123 * \t, \n, or \r. 124 */ 125 for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) { 126 if (c != '\t' && c != '\n' && c != '\r') { 127 builder.addEscape(c, "\uFFFD"); 128 } 129 } 130 131 // Build the content escaper first and then add quote escaping for the 132 // general escaper. 133 builder.addEscape('&', "&"); 134 builder.addEscape('<', "<"); 135 builder.addEscape('>', ">"); 136 XML_CONTENT_ESCAPER = builder.build(); 137 builder.addEscape('\'', "'"); 138 builder.addEscape('"', """); 139 XML_ESCAPER = builder.build(); 140 builder.addEscape('\t', "	"); 141 builder.addEscape('\n', "
"); 142 builder.addEscape('\r', "
"); 143 XML_ATTRIBUTE_ESCAPER = builder.build(); 144 } 145}