1: <?php
2:
/**
 * No-op error handler, installed temporarily while CSSTidy parses a
 * stylesheet so that anything the parser raises is swallowed.
 *
 * Why is this a top-level function? Because PHP 5.2.0 doesn't seem to
 * understand how to interpret this handler if it's a static method.
 * It's all rather silly, but if we keep going this route it might be
 * reasonable to coalesce all such helpers into one.
 *
 * @return void Always returns nothing (i.e. null).
 */
function htmlpurifier_filter_extractstyleblocks_muteerrorhandler()
{
    // Deliberately empty: the whole point is to do nothing.
}
8:
/**
 * This filter extracts <style> blocks from input HTML, cleans them up
 * using CSSTidy, and then places them in $purifier->context->get('StyleBlocks')
 * so they can be used elsewhere in the document.
 *
 * @note
 *      See tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php for
 *      sample usage.
 *
 * @note
 *      This filter can also be used on stylesheets not included in the
 *      document--something purists would probably prefer. Just directly
 *      call HTMLPurifier_Filter_ExtractStyleBlocks->cleanCSS()
 */
class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
{

    /** Name of this filter. */
    public $name = 'ExtractStyleBlocks';

    /** Raw contents of the <style> blocks collected by styleCallback(). */
    private $_styleMatches = array();

    /** CSSTidy instance used to parse and print stylesheets. */
    private $_tidy;

    /** Validator applied to the part following '#' in a simple selector. */
    private $_id_attrdef;

    /** Validator applied to the part following '.' in a simple selector. */
    private $_class_attrdef;

    /** Validator applied to the part following ':' in a simple selector. */
    private $_enum_attrdef;

    /**
     * Sets up the default CSSTidy instance and the validators used for
     * the id, class and pseudo-class components of selectors.
     */
    public function __construct() {
        $this->_tidy = new csstidy();
        $this->_tidy->set_cfg('lowercase_s', false);
        $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
        $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
        // whitelist of supported pseudo-classes
        $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(array('first-child', 'link', 'visited', 'active', 'hover', 'focus'));
    }

    /**
     * Save the contents of CSS blocks to style matches.
     *
     * Nothing is returned, so when used as a preg_replace_callback()
     * callback the matched <style> block is replaced by an empty string.
     *
     * @param $matches preg_replace style $matches array
     */
    protected function styleCallback($matches) {
        $this->_styleMatches[] = $matches[1];
    }

    /**
     * Removes inline <style> tags from HTML, saves them for later use
     *
     * The extracted blocks are registered in $context under the name
     * 'StyleBlocks' and, when a tidy implementation is available, each
     * one is passed through cleanCSS().
     *
     * @param $html HTML to filter
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return HTML with the <style> blocks removed
     * @todo Extend to indicate non-text/css style blocks
     */
    public function preFilter($html, $config, $context) {
        $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
        if ($tidy !== null) $this->_tidy = $tidy;
        // NB: the match is non-greedy (U modifier) so that
        //      <style>foo</style>  <style>bar</style>
        // yields two blocks. The content group must be (.*) and not
        // (.+): with (.+) an empty <style></style> is either left in the
        // document or, when another block follows, forces the match to
        // run on to the *next* </style>, capturing markup as CSS.
        $html = preg_replace_callback('#<style(?:\s.*)?>(.*)</style>#isU', array($this, 'styleCallback'), $html);
        $style_blocks = $this->_styleMatches;
        $this->_styleMatches = array(); // reset
        $context->register('StyleBlocks', $style_blocks); // $context must not be reused
        if ($this->_tidy) {
            // The blocks are cleaned in place after registration.
            // NOTE(review): for the cleaned values to be visible through
            // $context, register() must keep a reference to
            // $style_blocks -- confirm against HTMLPurifier_Context.
            foreach ($style_blocks as &$style) {
                $style = $this->cleanCSS($style, $config, $context);
            }
            // break the reference left behind by the by-reference loop
            unset($style);
        }
        return $html;
    }

    /**
     * Takes CSS (the stuff found in <style>) and cleans it.
     * @warning Requires CSSTidy <http://csstidy.sourceforge.net/>
     * @param $css     CSS styling to clean
     * @param $config  Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Cleaned CSS
     * @throws HTMLPurifier_Exception if the selector splitter yields a
     *         component without a recognised delimiter (broken invariant)
     */
    public function cleanCSS($css, $config, $context) {
        // prepare scope
        $scopes = array();
        $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
        if ($scope !== null) {
            foreach (explode(',', $scope) as $s) {
                $s = trim($s);
                // Skip empty fragments (e.g. from '' or a trailing
                // comma); otherwise they would emit " selector", which
                // is effectively an unscoped rule.
                if ($s !== '') $scopes[] = $s;
            }
        }
        // strip an HTML comment wrapper (<!-- ... -->) around the CSS
        $css = trim($css);
        if (strncmp('<!--', $css, 4) === 0) {
            $css = substr($css, 4);
        }
        if (strlen($css) > 3 && substr($css, -3) === '-->') {
            $css = substr($css, 0, -3);
        }
        $css = trim($css);
        // Mute errors while CSSTidy parses; make sure the previous
        // handler is restored even if the parser throws (try/catch +
        // rethrow rather than finally, to stay PHP 5.2 compatible).
        set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
        try {
            $this->_tidy->parse($css);
        } catch (Exception $e) {
            restore_error_handler();
            throw $e;
        }
        restore_error_handler();
        $css_definition = $config->getDefinition('CSS');
        $html_definition = $config->getDefinition('HTML');
        $new_css = array();
        foreach ($this->_tidy->css as $k => $decls) {
            // $decls are all CSS declarations inside an @ selector
            $new_decls = array();
            foreach ($decls as $selector => $style) {
                $selector = trim($selector);
                if ($selector === '') continue; // should not happen
                // Parse the selector
                // Here is the relevant part of the CSS grammar:
                //
                // ruleset
                //   : selector [ ',' S* selector ]* '{' ...
                // selector
                //   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
                // combinator
                //   : '+' S*
                //   : '>' S*
                // simple_selector
                //   : element_name [ HASH | class | attrib | pseudo ]*
                //   | [ HASH | class | attrib | pseudo ]+
                // element_name
                //   : IDENT | '*'
                //   ;
                // class
                //   : '.' IDENT
                //   ;
                // attrib
                //   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
                //     [ IDENT | STRING ] S* ]? ']'
                //   ;
                // pseudo
                //   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
                //   ;
                //
                // For reference, here are the relevant tokens:
                //
                // HASH         #{name}
                // IDENT        {ident}
                // INCLUDES     ~=
                // DASHMATCH    |=
                // STRING       {string}
                // FUNCTION     {ident}\(
                //
                // And the lexical scanner tokens
                //
                // name         {nmchar}+
                // nmchar       [_a-z0-9-]|{nonascii}|{escape}
                // nonascii     [\240-\377]
                // escape       {unicode}|\\[^\r\n\f0-9a-f]
                // unicode      \\{h}{1,6}(\r\n|[ \t\r\n\f])?
                // ident        -?{nmstart}{nmchar}*
                // nmstart      [_a-z]|{nonascii}|{escape}
                // string       {string1}|{string2}
                // string1      \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
                // string2      \'([^\n\r\f\\']|\\{nl}|{escape})*\'
                //
                // We'll implement a subset (in order to reduce attack
                // surface); in particular:
                //
                //      - No Unicode support
                //      - No escapes support
                //      - No string support (by proxy no attrib support)
                //      - element_name is matched against allowed
                //        elements (some people might find this
                //        annoying...)
                //      - Pseudo-classes limited to :first-child, :link,
                //        :visited, :active, :hover, :focus

                // handle ruleset
                $selectors = array_map('trim', explode(',', $selector));
                $new_selectors = array();
                foreach ($selectors as $sel) {
                    // split on +, > and spaces
                    $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
                    // even indices are chunks, odd indices are
                    // delimiters
                    $nsel = null;
                    $delim = null; // guaranteed to be non-null after
                                   // two loop iterations
                    for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
                        $x = $basic_selectors[$i];
                        if ($i % 2) {
                            // delimiter
                            if ($x === ' ') {
                                $delim = ' ';
                            } else {
                                $delim = ' ' . $x . ' ';
                            }
                        } else {
                            // simple selector
                            $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
                            $sdelim = null;
                            $nx = null;
                            for ($j = 0, $cc = count($components); $j < $cc; $j ++) {
                                $y = $components[$j];
                                if ($j === 0) {
                                    // element name: '*' or (lowercased)
                                    // an element known to the HTML
                                    // definition
                                    if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
                                        $nx = $y;
                                    } else {
                                        // $nx stays null; this matters
                                        // if we don't manage to find
                                        // any valid selector content,
                                        // in which case we ignore the
                                        // outer $delim
                                    }
                                } elseif ($j % 2) {
                                    // set delimiter
                                    $sdelim = $y;
                                } else {
                                    // pick the validator matching the
                                    // delimiter that preceded this part
                                    $attrdef = null;
                                    if ($sdelim === '#') {
                                        $attrdef = $this->_id_attrdef;
                                    } elseif ($sdelim === '.') {
                                        $attrdef = $this->_class_attrdef;
                                    } elseif ($sdelim === ':') {
                                        $attrdef = $this->_enum_attrdef;
                                    } else {
                                        throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
                                    }
                                    $r = $attrdef->validate($y, $config, $context);
                                    if ($r !== false) {
                                        // a non-true result is used as
                                        // the replacement value
                                        if ($r !== true) {
                                            $y = $r;
                                        }
                                        if ($nx === null) {
                                            $nx = '';
                                        }
                                        $nx .= $sdelim . $y;
                                    }
                                }
                            }
                            if ($nx !== null) {
                                if ($nsel === null) {
                                    $nsel = $nx;
                                } else {
                                    $nsel .= $delim . $nx;
                                }
                            } else {
                                // delimiters to the left of invalid
                                // basic selector ignored
                            }
                        }
                    }
                    if ($nsel !== null) {
                        if (!empty($scopes)) {
                            // emit one copy of the selector per scope
                            foreach ($scopes as $s) {
                                $new_selectors[] = "$s $nsel";
                            }
                        } else {
                            $new_selectors[] = $nsel;
                        }
                    }
                }
                // no usable selector left: drop the whole rule
                if (empty($new_selectors)) continue;
                $selector = implode(', ', $new_selectors);
                // keep only properties known to the CSS definition whose
                // values validate, storing the validated value
                foreach ($style as $name => $value) {
                    if (!isset($css_definition->info[$name])) {
                        unset($style[$name]);
                        continue;
                    }
                    $def = $css_definition->info[$name];
                    $ret = $def->validate($value, $config, $context);
                    if ($ret === false) unset($style[$name]);
                    else $style[$name] = $ret;
                }
                $new_decls[$selector] = $style;
            }
            $new_css[$k] = $new_decls;
        }
        // remove stuff that shouldn't be used, could be reenabled
        // after security risks are analyzed
        $this->_tidy->css = $new_css;
        $this->_tidy->import = array();
        $this->_tidy->charset = null;
        $this->_tidy->namespace = null;
        $css = $this->_tidy->print->plain();
        // we are going to escape any special characters <>& to ensure
        // that no funny business occurs (i.e. </style> in a font-family prop).
        if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
            $css = str_replace(
                array('<', '>', '&'),
                array('\3C ', '\3E ', '\26 '),
                $css
            );
        }
        return $css;
    }

}
288:
289: // vim: et sw=4 sts=4
290: