_blocks = preg_split('@\\s+@', $this->_blocks);
        // (constructor tail, kept verbatim) turn the whitespace-separated
        // tag-name lists into arrays and derive the placeholder prefix
        $this->_descendList = preg_split('@\\s+@', $this->_descendList);
        $this->_alterList = preg_split('@\\s+@', $this->_alterList);
        $this->_inlines = preg_split('@\\s+@', $this->_inlines);
        $this->_unique = md5(__FILE__);
    }

    /**
     * Create wrapper P and BR elements in HTML depending on newlines. Useful when
     * users use newlines to signal line and paragraph breaks. In all cases output
     * should be well-formed markup.
     *
     * In DIV, LI, TD, and TH elements, Ps are only added when there would be at
     * least two of them.
     *
     * The snippet is parsed to a DOM, inline content is moved into temporary
     * AUTOP elements (see addParagraphs()), the document is serialized and
     * re-parsed, superfluous AUTOPs are marked and stripped, and the remaining
     * AUTOPs are finally rewritten as P elements.
     *
     * @param string $html snippet
     * @return string|false output or false if parse error occurred
     */
    public function process($html) {
        // normalize whitespace
        $html = str_replace(array("\r\n", "\r"), "\n", $html);

        // allows preserving entities untouched
        $html = str_replace('&', $this->_unique . 'AMP', $html);

        $this->_doc = new DOMDocument();

        // parse to DOM, suppressing loadHTML warnings
        // http://www.php.net/manual/en/domdocument.loadhtml.php#95463
        libxml_use_internal_errors(true);

        // Wrap the snippet in a full document so that a BODY element exists.
        // NOTE(review): the wrapper markup had been stripped from this copy of
        // the file; the charset declaration is assumed to be UTF-8 -- confirm
        // against the canonical source (it may be configurable there).
        $wrapped = "<html><meta http-equiv='content-type' content='text/html; charset=UTF-8'>"
            . "<body>{$html}</body></html>";
        if (!$this->loadHtmlWithoutEntities($wrapped)) {
            return false;
        }

        $this->_xpath = new DOMXPath($this->_doc);

        // start processing recursively at the BODY element
        $body = $this->_xpath->query('//body[1]')->item(0);
        if (!($body instanceof DOMElement)) {
            // nothing to process; addParagraphs() requires a DOMElement
            return false;
        }
        $this->addParagraphs($body);

        // serialize back to HTML
        $html = $this->_doc->saveHTML();

        // Note: we create <autop> elements, which will later be converted to paragraphs

        // split AUTOPs into multiples at /\n\n+/
        $html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
        $html = str_replace(
            array($this->_unique . 'BR', $this->_unique . 'NL', '<br>'),
            '<br />',
            $html
        );
        // a line break right before the end of a paragraph is redundant
        $html = str_replace('<br /></autop>', '</autop>', $html);

        // re-parse so we can handle new AUTOP elements
        if (!$this->loadHtmlWithoutEntities($html)) {
            return false;
        }

        // must re-create XPath object after DOM load
        $this->_xpath = new DOMXPath($this->_doc);

        // strip AUTOPs that only have comments/whitespace
        foreach ($this->_xpath->query('//autop') as $autop) {
            /* @var DOMElement $autop */
            $hasContent = false;
            if (trim($autop->textContent) !== '') {
                $hasContent = true;
            } else {
                foreach ($autop->childNodes as $node) {
                    if ($node->nodeType === XML_ELEMENT_NODE) {
                        $hasContent = true;
                        break;
                    }
                }
            }
            if (!$hasContent) {
                // mark to be later replaced w/ preg_replace (faster than moving nodes out)
                $autop->setAttribute("r", "1");
            }
        }

        // If a DIV contains a single AUTOP, remove it
        foreach ($this->_xpath->query('//div') as $el) {
            /* @var DOMElement $el */
            $autops = $this->_xpath->query('./autop', $el);
            if ($autops->length === 1) {
                $firstAutop = $autops->item(0);
                /* @var DOMElement $firstAutop */
                $firstAutop->setAttribute("r", "1");
            }
        }

        $html = $this->_doc->saveHTML();

        // trim to the contents of BODY (6 === strlen('<body>'))
        $bodyStart = strpos($html, '<body>');
        if ($bodyStart === false) {
            return false;
        }
        $bodyEnd = strpos($html, '</body>', $bodyStart + 6);
        if ($bodyEnd === false) {
            return false;
        }
        $html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);

        // strip AUTOPs that should be removed (keeps their contents)
        $html = preg_replace('@<autop r="1">(.*?)</autop>@', '\\1', $html);

        // commit to converting AUTOPs to Ps
        $html = str_replace('<autop>', "\n<p>", $html);
        $html = str_replace('</autop>', "</p>\n", $html);

        $html = str_replace('<br>', '<br />', $html);

        // restore the ampersands protected at the start
        $html = str_replace($this->_unique . 'AMP', '&', $html);
        return $html;
    }

    /**
     * Load markup into $this->_doc with external entity loading disabled.
     *
     * Do not load entities. May be unnecessary, better safe than sorry.
     * libxml_disable_entity_loader() is deprecated as of PHP 8.0, where external
     * entity loading is already disabled by default, so it is only toggled on
     * older runtimes; the previous setting is always restored.
     *
     * @param string $html markup to parse
     * @return bool true on success, false if loadHTML() failed
     */
    private function loadHtmlWithoutEntities($html) {
        $toggle = (PHP_VERSION_ID < 80000) && function_exists('libxml_disable_entity_loader');
        $previous = false;
        if ($toggle) {
            $previous = libxml_disable_entity_loader(true);
        }

        $loaded = (bool) $this->_doc->loadHTML($html);

        if ($toggle) {
            libxml_disable_entity_loader($previous);
        }
        return $loaded;
    }

    /**
     * Add P and BR elements as necessary
     *
     * Walks the given element and queued block-level descendants (iteratively,
     * no recursion). Inside elements named in the alter list, runs of inline
     * content are moved into temporary AUTOP elements and newlines in text are
     * replaced by NL placeholders. Inline elements containing newlines are
     * queued and handled afterwards, where newlines become BR placeholders.
     *
     * @param DOMElement $el DOM element
     * @return void
     */
    protected function addParagraphs(DOMElement $el) {
        // no need to call recursively, just queue up
        $elsToProcess = array($el);
        $inlinesToProcess = array();
        while ($el = array_shift($elsToProcess)) {
            // if true, we can alter all child nodes, if not, we'll just call
            // addParagraphs on each element in the descendInto list
            $alterInline = in_array($el->nodeName, $this->_alterList, true);

            // inside affected elements, we want to trim leading whitespace from
            // the first text node
            $ltrimFirstTextNode = true;

            // should we open a new AUTOP element to move inline elements into?
            $openP = true;
            $autop = null;

            // after BR, ignore a newline
            $isFollowingBr = false;

            $node = $el->firstChild;
            while (null !== $node) {
                if ($alterInline && $openP) {
                    $openP = false;
                    // create a P to move inline content into (this may be removed later)
                    $autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
                }

                $isElement = ($node->nodeType === XML_ELEMENT_NODE);
                $isBlock = $isElement && in_array($node->nodeName, $this->_blocks, true);

                if ($alterInline) {
                    $isText = ($node->nodeType === XML_TEXT_NODE);
                    $isLastInline = (! $node->nextSibling
                        || ($node->nextSibling->nodeType === XML_ELEMENT_NODE
                            && in_array($node->nextSibling->nodeName, $this->_blocks, true)));
                    if ($isElement) {
                        $isFollowingBr = ($node->nodeName === 'br');
                    }

                    if ($isText) {
                        $nodeText = $node->nodeValue;
                        if ($ltrimFirstTextNode) {
                            $nodeText = ltrim($nodeText);
                            $ltrimFirstTextNode = false;
                        }
                        if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) {
                            // if a user ends a line with <br>, don't add a second BR
                            $nodeText = substr($nodeText, strlen($m[0]));
                        }
                        if ($isLastInline) {
                            $nodeText = rtrim($nodeText);
                        }
                        $nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);
                        $tmpNode = $node;
                        $node = $node->nextSibling; // move loop to next node

                        // alter node in place, then move into AUTOP
                        $tmpNode->nodeValue = $nodeText;
                        $autop->appendChild($tmpNode);

                        continue;
                    }
                }

                if ($isBlock || ! $node->nextSibling) {
                    if ($isBlock && in_array($node->nodeName, $this->_descendList, true)) {
                        // queue instead of recursing
                        $elsToProcess[] = $node;
                    }
                    // the next inline run starts a fresh AUTOP
                    $openP = true;
                    $ltrimFirstTextNode = true;
                }

                if ($alterInline && ! $isBlock) {
                    $tmpNode = $node;
                    if ($isElement && false !== strpos($tmpNode->textContent, "\n")) {
                        $inlinesToProcess[] = $tmpNode;
                    }
                    // advance before moving the node, which changes its siblings
                    $node = $node->nextSibling;
                    $autop->appendChild($tmpNode);
                    continue;
                }

                $node = $node->nextSibling;
            }
        }

        // handle inline nodes
        // no need to recurse, just queue up
        while ($el = array_shift($inlinesToProcess)) {
            $ignoreLeadingNewline = false;
            foreach ($el->childNodes as $node) {
                if ($node->nodeType === XML_ELEMENT_NODE) {
                    // Compare the tag name: an element's nodeValue is its text
                    // content, so the former nodeValue === 'BR' test never
                    // matched a BR element.
                    if ($node->nodeName === 'br') {
                        $ignoreLeadingNewline = true;
                    } else {
                        $ignoreLeadingNewline = false;
                        if (false !== strpos($node->textContent, "\n")) {
                            $inlinesToProcess[] = $node;
                        }
                    }
                    continue;
                } elseif ($node->nodeType === XML_TEXT_NODE) {
                    $text = $node->nodeValue;
                    // a newline right after an explicit BR must not yield a second BR
                    if ($ignoreLeadingNewline && $text !== '' && $text[0] === "\n") {
                        $text = substr($text, 1);
                    }
                    // the flag only applies to the text directly following the BR
                    $ignoreLeadingNewline = false;
                    $node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
                }
            }
        }
    }
}