1   /**
2    * Copyright (c) 2000-2009 Liferay, Inc. All rights reserved.
3    *
4    * The contents of this file are subject to the terms of the Liferay Enterprise
5    * Subscription License ("License"). You may not use this file except in
6    * compliance with the License. You can obtain a copy of the License by
7    * contacting Liferay, Inc. See the License for the specific language governing
8    * permissions and limitations under the License, including but not limited to
9    * distribution rights of the Software.
10   *
11   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17   * SOFTWARE.
18   */
19  
20  package com.liferay.portal.util;
21  
22  import au.id.jericho.lib.html.Source;
23  
24  import com.liferay.portal.kernel.util.Html;
25  import com.liferay.portal.kernel.util.StringPool;
26  import com.liferay.portal.kernel.util.StringUtil;
27  
28  /**
29   * <a href="HtmlImpl.java.html"><b><i>View Source</i></b></a>
30   *
31   * @author Brian Wing Shun Chan
32   * @author Clarence Shen
33   * @author Harry Mark
34   *
35   */
36  public class HtmlImpl implements Html {
37  
38      public String escape(String text) {
39          if (text == null) {
40              return null;
41          }
42  
43          // Escape using XSS recommendations from
44          // http://www.owasp.org/index.php/Cross_Site_Scripting
45          // #How_to_Protect_Yourself
46  
47          StringBuilder sb = new StringBuilder(text.length());
48  
49          for (int i = 0; i < text.length(); i++) {
50              char c = text.charAt(i);
51  
52              switch (c) {
53                  case '<':
54                      sb.append("&lt;");
55  
56                      break;
57  
58                  case '>':
59                      sb.append("&gt;");
60  
61                      break;
62  
63                  case '&':
64                      sb.append("&amp;");
65  
66                      break;
67  
68                  case '"':
69                      sb.append("&#034;");
70  
71                      break;
72  
73                  case '\'':
74                      sb.append("&#039;");
75  
76                      break;
77  
78                  case '(':
79                      sb.append("&#040;");
80  
81                      break;
82  
83                  case ')':
84                      sb.append("&#041;");
85  
86                      break;
87  
88                  case '#':
89                      sb.append("&#035;");
90  
91                      break;
92  
93                  case '%':
94                      sb.append("&#037;");
95  
96                      break;
97  
98                  case ';':
99                      sb.append("&#059;");
100 
101                     break;
102 
103                 case '+':
104                     sb.append("&#043;");
105 
106                     break;
107 
108                 case '-':
109                     sb.append("&#045;");
110 
111                     break;
112 
113                 default:
114                     sb.append(c);
115 
116                     break;
117             }
118         }
119 
120         return sb.toString();
121     }
122 
123     public String extractText(String html) {
124         if (html == null) {
125             return null;
126         }
127 
128         Source source = new Source(html);
129 
130         return source.getTextExtractor().toString();
131     }
132 
133     public String fromInputSafe(String text) {
134         return StringUtil.replace(text, "&amp;", "&");
135     }
136 
137     public String replaceMsWordCharacters(String text) {
138         return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
139     }
140 
141     public String stripBetween(String text, String tag) {
142         return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
143     }
144 
145     public String stripComments(String text) {
146         return StringUtil.stripBetween(text, "<!--", "-->");
147     }
148 
149     public String stripHtml(String text) {
150         if (text == null) {
151             return null;
152         }
153 
154         text = stripComments(text);
155 
156         StringBuilder sb = new StringBuilder(text.length());
157 
158         int x = 0;
159         int y = text.indexOf("<");
160 
161         while (y != -1) {
162             sb.append(text.substring(x, y));
163             sb.append(StringPool.SPACE);
164 
165             // Look for text enclosed by <script></script>
166 
167             boolean scriptFound = isScriptTag(text, y + 1);
168 
169             if (scriptFound) {
170                 int pos = y + _TAG_SCRIPT.length;
171 
172                 // Find end of the tag
173 
174                 pos = text.indexOf(">", pos);
175 
176                 if (pos >= 0) {
177 
178                     // Check if preceding character is / (i.e. is this instance
179                     // of <script/>)
180 
181                     if (text.charAt(pos-1) != '/') {
182 
183                         // Search for the ending </script> tag
184 
185                         for (;;) {
186                             pos = text.indexOf("</", pos);
187 
188                             if (pos >= 0) {
189                                 if (isScriptTag(text, pos + 2)) {
190                                     y = pos;
191 
192                                     break;
193                                 }
194                                 else {
195 
196                                     // Skip past "</"
197 
198                                     pos += 2;
199                                 }
200                             }
201                             else {
202                                 break;
203                             }
204                         }
205                     }
206                 }
207             }
208 
209             x = text.indexOf(">", y);
210 
211             if (x == -1) {
212                 break;
213             }
214 
215             x++;
216 
217             if (x < y) {
218 
219                 // <b>Hello</b
220 
221                 break;
222             }
223 
224             y = text.indexOf("<", x);
225         }
226 
227         if (y == -1) {
228             sb.append(text.substring(x, text.length()));
229         }
230 
231         return sb.toString();
232     }
233 
234     public String toInputSafe(String text) {
235         return StringUtil.replace(
236             text,
237             new String[] {"&", "\""},
238             new String[] {"&amp;", "&quot;"});
239     }
240 
241     public String unescape(String text) {
242         if (text == null) {
243             return null;
244         }
245 
246         // Optimize this
247 
248         text = StringUtil.replace(text, "&lt;", "<");
249         text = StringUtil.replace(text, "&gt;", ">");
250         text = StringUtil.replace(text, "&amp;", "&");
251         text = StringUtil.replace(text, "&#034;", "\"");
252         text = StringUtil.replace(text, "&#039;", "'");
253         text = StringUtil.replace(text, "&#040;", "(");
254         text = StringUtil.replace(text, "&#041;", ")");
255         text = StringUtil.replace(text, "&#035;", "#");
256         text = StringUtil.replace(text, "&#037;", "%");
257         text = StringUtil.replace(text, "&#059;", ";");
258         text = StringUtil.replace(text, "&#043;", "+");
259         text = StringUtil.replace(text, "&#045;", "-");
260 
261         return text;
262     }
263 
264     protected boolean isScriptTag(String text, int pos) {
265         if (pos + _TAG_SCRIPT.length + 1 <= text.length()) {
266             char item;
267 
268             for (int i = 0; i < _TAG_SCRIPT.length; i++) {
269                 item = text.charAt(pos++);
270 
271                 if (Character.toLowerCase(item) != _TAG_SCRIPT[i]) {
272                     return false;
273                 }
274             }
275 
276             item = text.charAt(pos);
277 
278             // Check that char after "script" is not a letter (i.e. another tag)
279 
280             return !Character.isLetter(item);
281         }
282         else {
283             return false;
284         }
285     }
286 
287     private static final String[] _MS_WORD_UNICODE = new String[] {
288         "\u00ae", "\u2019", "\u201c", "\u201d"
289     };
290 
291     private static final String[] _MS_WORD_HTML = new String[] {
292         "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
293     };
294 
295     private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
296 
297 }