ElggAutoP.php 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. <?php
  /**
   * Create wrapper P and BR elements in HTML depending on newlines. Useful when
   * users use newlines to signal line and paragraph breaks. In all cases output
   * should be well-formed markup.
   *
   * In DIV elements, Ps are only added when there would be at
   * least two of them.
   *
   * @package Elgg.Core
   * @subpackage Output
   */
  class ElggAutoP {

      /**
       * Character encoding announced to the HTML parser for the input snippet.
       *
       * @var string
       */
      public $encoding = 'UTF-8';

      /**
       * Document currently being processed (re-created on every process() call).
       *
       * @var DOMDocument
       */
      protected $_doc = null;

      /**
       * XPath helper bound to $_doc. It is rebuilt each time $_doc is (re)loaded.
       *
       * @var DOMXPath
       */
      protected $_xpath = null;

      /**
       * Names of elements treated as block-level.
       *
       * Declared as a whitespace-delimited string and split into an array by
       * the constructor.
       *
       * @var string[]|string
       */
      protected $_blocks = 'address article area aside blockquote caption col colgroup dd
          details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header
          hr hgroup legend map math menu nav noscript p pre section select style summary
          table tbody td tfoot th thead tr ul ol option li';

      /**
       * Names of elements treated as inline.
       *
       * Declared as a whitespace-delimited string and split into an array by
       * the constructor.
       *
       * @var string[]|string
       */
      protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist
          del dfn em embed i iframe img input ins kbd keygen label map mark meter object
          output progress q rp rt ruby s samp script select small source span strong style
          sub sup textarea time var video wbr';

      /**
       * Descend into these elements to add Ps
       *
       * Declared as a whitespace-delimited string and split into an array by
       * the constructor.
       *
       * @var string[]|string
       */
      protected $_descendList = 'article aside blockquote body details div footer form
          header section';

      /**
       * Add Ps inside these elements
       *
       * Declared as a whitespace-delimited string and split into an array by
       * the constructor.
       *
       * @var string[]|string
       */
      protected $_alterList = 'article aside blockquote body details div footer header
          section';

      /**
       * Token prefix used to build placeholders (…AMP, …NL, …BR) that survive a
       * round trip through the DOM serializer.
       *
       * @var string
       */
      protected $_unique = '';

      /**
       * Constructor
       *
       * Converts the whitespace-delimited element lists into arrays and
       * initialises the placeholder prefix.
       */
      public function __construct() {
          foreach (array('_blocks', '_descendList', '_alterList', '_inlines') as $listName) {
              // a subclass may already provide an array; only split strings
              if (is_string($this->$listName)) {
                  $this->$listName = preg_split('@\\s+@', $this->$listName, -1, PREG_SPLIT_NO_EMPTY);
              }
          }
          $this->_unique = md5(__FILE__);
      }

      /**
       * Create wrapper P and BR elements in HTML depending on newlines. Useful when
       * users use newlines to signal line and paragraph breaks. In all cases output
       * should be well-formed markup.
       *
       * In DIV elements, Ps are only added when there would be at least two of them.
       *
       * @param string $html snippet
       * @return string|false output or false if parse error occurred
       */
      public function process($html) {
          // normalize line endings
          $html = str_replace(array("\r\n", "\r"), "\n", $html);

          // hide ampersands from the parser so entities are preserved untouched
          $html = str_replace('&', $this->_unique . 'AMP', $html);

          // Collect loadHTML warnings internally instead of emitting them, and
          // remember the caller's setting so it can be restored afterwards.
          // http://www.php.net/manual/en/domdocument.loadhtml.php#95463
          $previousErrorMode = libxml_use_internal_errors(true);

          $result = $this->convert($html);

          if (!$previousErrorMode) {
              // The buffer only holds warnings we caused; drop them so they do
              // not accumulate across calls. If the caller had internal errors
              // enabled already, the buffer is theirs to manage.
              libxml_clear_errors();
          }
          libxml_use_internal_errors($previousErrorMode);

          return $result;
      }

      /**
       * Run the actual conversion on a snippet whose line endings and
       * ampersands have already been normalized by process().
       *
       * @param string $html normalized snippet
       * @return string|false output or false if parse error occurred
       */
      private function convert($html) {
          $this->_doc = new DOMDocument();

          $wrapped = "<html><meta http-equiv='content-type' "
              . "content='text/html; charset={$this->encoding}'><body>{$html}</body>"
              . "</html>";
          if (!$this->loadHtml($wrapped)) {
              return false;
          }

          // start processing at the BODY element
          $body = $this->_xpath->query('//body[1]')->item(0);
          if (!($body instanceof DOMElement)) {
              return false;
          }
          $this->addParagraphs($body);

          // serialize back to HTML
          $html = $this->_doc->saveHTML();

          // Note: we create <autop> elements, which will later be converted to paragraphs

          // split AUTOPs into multiples at /\n\n+/
          $html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
          $html = str_replace(
              array($this->_unique . 'BR', $this->_unique . 'NL', '<br>'),
              '<br />',
              $html
          );
          $html = str_replace('<br /></autop>', '</autop>', $html);

          // re-parse so we can handle the new AUTOP elements
          if (!$this->loadHtml($html)) {
              return false;
          }

          // mark AUTOPs that only have comments/whitespace for removal
          foreach ($this->_xpath->query('//autop') as $autop) {
              /* @var DOMElement $autop */
              $hasContent = (trim($autop->textContent) !== '');
              if (!$hasContent) {
                  foreach ($autop->childNodes as $child) {
                      if ($child->nodeType === XML_ELEMENT_NODE) {
                          $hasContent = true;
                          break;
                      }
                  }
              }
              if (!$hasContent) {
                  // mark to be later replaced w/ preg_replace (faster than moving nodes out)
                  $autop->setAttribute("r", "1");
              }
          }

          // If a DIV contains a single AUTOP, remove it
          foreach ($this->_xpath->query('//div') as $div) {
              /* @var DOMElement $div */
              $autops = $this->_xpath->query('./autop', $div);
              if ($autops->length === 1) {
                  /* @var DOMElement $onlyAutop */
                  $onlyAutop = $autops->item(0);
                  $onlyAutop->setAttribute("r", "1");
              }
          }

          $html = $this->_doc->saveHTML();

          // trim to the contents of BODY
          $bodyStart = strpos($html, '<body>');
          if ($bodyStart === false) {
              return false;
          }
          $bodyEnd = strpos($html, '</body>', $bodyStart + 6);
          if ($bodyEnd === false) {
              return false;
          }
          $html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);

          // Unwrap AUTOPs marked for removal. The "s" modifier lets "." span
          // newlines, which may appear inside e.g. multi-line comments.
          $html = preg_replace('@<autop r="1">(.*?)</autop>@s', '\\1', $html);

          // commit to converting AUTOPs to Ps
          $html = str_replace('<autop>', "\n<p>", $html);
          $html = str_replace('</autop>', "</p>\n", $html);
          $html = str_replace('<br>', '<br />', $html);

          // restore the ampersands hidden by process()
          $html = str_replace($this->_unique . 'AMP', '&', $html);

          return $html;
      }

      /**
       * Load markup into $_doc and rebuild $_xpath for the new tree.
       *
       * @param string $html markup to parse
       * @return bool false if the parser rejected the markup
       */
      private function loadHtml($html) {
          // Do not load entities. libxml_disable_entity_loader() is deprecated as
          // of PHP 8.0, so it is only toggled on older runtimes.
          $toggleLoader = (PHP_VERSION_ID < 80000) && function_exists('libxml_disable_entity_loader');

          $previousLoader = false;
          if ($toggleLoader) {
              $previousLoader = libxml_disable_entity_loader(true);
          }

          $loaded = $this->_doc->loadHTML($html);

          if ($toggleLoader) {
              libxml_disable_entity_loader($previousLoader);
          }

          if (!$loaded) {
              return false;
          }

          // an XPath object is tied to a DOM tree and must be re-created after a load
          $this->_xpath = new DOMXPath($this->_doc);
          return true;
      }

      /**
       * Add P and BR elements as necessary
       *
       * Works on placeholders: inline runs are moved into temporary AUTOP
       * elements and newlines are replaced by NL/BR tokens, all of which
       * process() later turns into real P and BR markup.
       *
       * @param DOMElement $el DOM element
       * @return void
       */
      protected function addParagraphs(DOMElement $el) {
          // no need to call recursively, just queue up
          $elsToProcess = array($el);
          $inlinesToProcess = array();

          while ($el = array_shift($elsToProcess)) {
              // if true, we can alter all child nodes, if not, we'll just queue
              // each child element that is in the descend list
              $alterInline = in_array($el->nodeName, $this->_alterList, true);

              // inside affected elements, we want to trim leading whitespace from
              // the first text node
              $ltrimFirstTextNode = true;

              // should we open a new AUTOP element to move inline elements into?
              $openP = true;
              $autop = null;

              // after BR, ignore a newline
              $isFollowingBr = false;

              $node = $el->firstChild;
              while (null !== $node) {
                  if ($alterInline && $openP) {
                      $openP = false;
                      // create a P to move inline content into (this may be removed later)
                      $autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
                  }

                  $isElement = ($node->nodeType === XML_ELEMENT_NODE);
                  $isBlock = $isElement && in_array($node->nodeName, $this->_blocks, true);

                  if ($alterInline) {
                      $isText = ($node->nodeType === XML_TEXT_NODE);
                      $next = $node->nextSibling;
                      $isLastInline = (!$next
                          || ($next->nodeType === XML_ELEMENT_NODE
                              && in_array($next->nodeName, $this->_blocks, true)));

                      if ($isElement) {
                          $isFollowingBr = ($node->nodeName === 'br');
                      }

                      if ($isText) {
                          $nodeText = $node->nodeValue;

                          if ($ltrimFirstTextNode) {
                              $nodeText = ltrim($nodeText);
                              $ltrimFirstTextNode = false;
                          }
                          if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) {
                              // if a user ends a line with <br>, don't add a second BR
                              $nodeText = substr($nodeText, strlen($m[0]));
                          }
                          if ($isLastInline) {
                              $nodeText = rtrim($nodeText);
                          }
                          $nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);

                          // advance the loop before the node is moved out of $el
                          $tmpNode = $node;
                          $node = $node->nextSibling;

                          // alter node in place, then move into AUTOP
                          $tmpNode->nodeValue = $nodeText;
                          $autop->appendChild($tmpNode);
                          continue;
                      }
                  }

                  if ($isBlock || !$node->nextSibling) {
                      if ($isBlock && in_array($node->nodeName, $this->_descendList, true)) {
                          $elsToProcess[] = $node;
                      }
                      // whatever follows a block starts a fresh AUTOP
                      $openP = true;
                      $ltrimFirstTextNode = true;
                  }

                  if ($alterInline && !$isBlock) {
                      $tmpNode = $node;
                      if ($isElement && false !== strpos($tmpNode->textContent, "\n")) {
                          // newlines inside inline elements are handled in the second loop
                          $inlinesToProcess[] = $tmpNode;
                      }
                      // advance the loop before the node is moved out of $el
                      $node = $node->nextSibling;
                      $autop->appendChild($tmpNode);
                      continue;
                  }

                  $node = $node->nextSibling;
              }
          }

          // handle inline nodes
          // no need to recurse, just queue up
          while ($el = array_shift($inlinesToProcess)) {
              $ignoreLeadingNewline = false;
              foreach ($el->childNodes as $node) {
                  if ($node->nodeType === XML_ELEMENT_NODE) {
                      // DOM reports HTML element names in lower case; the BR check
                      // must look at the node name, not its (empty) value
                      if ($node->nodeName === 'br') {
                          $ignoreLeadingNewline = true;
                      } else {
                          $ignoreLeadingNewline = false;
                          if (false !== strpos($node->textContent, "\n")) {
                              $inlinesToProcess[] = $node;
                          }
                      }
                      continue;
                  }

                  if ($node->nodeType === XML_TEXT_NODE) {
                      $text = $node->nodeValue;
                      if ($ignoreLeadingNewline && $text !== '' && $text[0] === "\n") {
                          // a newline right after <br> must not produce a second BR
                          $text = substr($text, 1);
                      }
                      // only the text directly following a BR is affected
                      $ignoreLeadingNewline = false;
                      $node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
                  }
              }
          }
      }
  }