h2d_htmlconverter.php 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738
  1. <?php
  2. /**
  3. * HTMLtodocx
  4. * HTML to docx Converter
  5. * - HTML converter for use with PHPWord
  6. * Copyright (c) 2011 Commtap CIC
  7. *
  8. * Please see the accompanying license.txt file (in the same directory as this
  9. * file) for the licensing conditions for the use of this software.
  10. *
  11. */
  12. // Functions for converting and adding HTML into PHPWord objects
  13. // for creating a docx document.
  /**
   * Lists the HTML tags this converter knows how to process.
   *
   * The map tells the recursive inserter when to stop converting structurally:
   * a child tag that is not listed for its parent has its content added as
   * plain text, with any tags filtered out.
   *
   * @param string $tag
   *   (optional) The tag whose permitted children are required.
   *
   * @return array
   *   When $tag is empty: the complete map of tag => permitted child tags.
   *   Otherwise the list of permitted child tags for $tag, or an empty array
   *   when the tag is not known to the converter.
   */
  function htmltodocx_html_allowed_children($tag = NULL) {
    // Lists shared by several parents. 'text' is the tag name SimpleHtmlDom
    // uses for nodes that contain just text.
    $heading_tags = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
    $inline_in_heading = array('a', 'em', 'i', 'strong', 'b', 'br', 'span', 'code', 'u', 'sup', 'text');
    $inline_in_italic = array('a', 'strong', 'b', 'br', 'span', 'code', 'u', 'sup', 'text');
    $inline_in_bold = array('a', 'em', 'i', 'br', 'span', 'code', 'u', 'sup', 'text');
    // p does not nest - simple_html_dom will create a flat set of paragraphs
    // if it finds nested ones.
    $paragraph_children = array('a', 'em', 'i', 'strong', 'b', 'ul', 'ol', 'img', 'table', 'br', 'span', 'code', 'u', 'sup', 'text', 'div', 'p');
    // 'table' is listed for cells, but PHPWord cannot insert a table into a
    // table cell; the inserter tracks that with $state['table_allowed'].
    $cell_children = array('p', 'a', 'em', 'i', 'strong', 'b', 'ul', 'ol', 'img', 'br', 'span', 'code', 'u', 'sup', 'text', 'table');
    $list_children = array('li');
    $no_children = array();

    $map = array(
      'body' => array_merge(array('p', 'ul', 'ol', 'table', 'div'), $heading_tags),
      'h1' => $inline_in_heading,
      'h2' => $inline_in_heading,
      'h3' => $inline_in_heading,
      'h4' => $inline_in_heading,
      'h5' => $inline_in_heading,
      'h6' => $inline_in_heading,
      'p' => $paragraph_children,
      // A div accepts everything a paragraph does, plus headings.
      'div' => array_merge($paragraph_children, $heading_tags),
      // PHPWord doesn't allow elements to be placed in link elements.
      'a' => array('text'),
      // Italic.
      'em' => $inline_in_italic,
      'i' => $inline_in_italic,
      // Bold.
      'strong' => $inline_in_bold,
      'b' => $inline_in_bold,
      // Superscript.
      'sup' => array('a', 'em', 'i', 'br', 'span', 'code', 'u', 'text'),
      // Underline - deprecated - but could be encountered.
      'u' => array('a', 'em', 'strong', 'b', 'i', 'br', 'span', 'code', 'sup', 'text'),
      'ul' => $list_children,
      'ol' => $list_children,
      'li' => array('a', 'em', 'i', 'strong', 'b', 'ul', 'ol', 'img', 'br', 'span', 'code', 'u', 'sup', 'text'),
      'img' => $no_children,
      'table' => array('tbody', 'tr'),
      'tbody' => array('tr'),
      'tr' => array('td', 'th'),
      'td' => $cell_children,
      'th' => $cell_children,
      'br' => $no_children,
      // Note, elements nested inside the code element do not work! (Perhaps
      // simpleHTMLDom isn't recognising them).
      'code' => $no_children,
      // Used for styles - underline.
      'span' => array('a', 'em', 'i', 'strong', 'b', 'img', 'br', 'span', 'code', 'sup', 'text'),
      'text' => $no_children,
    );

    if (!$tag) {
      return $map;
    }
    return isset($map[$tag]) ? $map[$tag] : array();
  }
  /**
   * Cleans up a fragment of HTML so it can be added as plain text.
   *
   * Non-breaking space entities become ordinary spaces, tags are stripped,
   * runs of whitespace are collapsed to a single space and the remaining HTML
   * entities are decoded as UTF-8.
   *
   * @param string $text
   *   The raw text (possibly containing markup and entities).
   *
   * @return string
   *   The cleaned text.
   */
  function htmltodocx_clean_text($text) {
    // Replace each &nbsp; with a single space:
    $text = str_replace('&nbsp;', ' ', $text);
    if (strpos($text, '<') !== FALSE) {
      // We only run strip_tags if it looks like there might be some tags in
      // the text as strip_tags is expensive:
      $text = strip_tags($text);
    }
    // Strip out extra spaces. With the /u modifier preg_replace() returns NULL
    // when the subject is not valid UTF-8, which would silently discard the
    // whole text; fall back to a byte-wise collapse in that case.
    $collapsed = preg_replace('/\s+/u', ' ', $text);
    if ($collapsed === NULL) {
      $collapsed = preg_replace('/\s+/', ' ', $text);
    }
    if ($collapsed === NULL) {
      // Both attempts failed: keep the text rather than lose it.
      $collapsed = $text;
    }
    // Convert entities:
    return html_entity_decode($collapsed, ENT_COMPAT, 'UTF-8');
  }
  /**
   * Computes the PHPWord style that applies to the current element.
   *
   * Styles are layered, later layers overriding earlier ones:
   * the style sheet default, the inheritable part of the current style, the
   * style defined for the element's tag, styles for the element's classes and
   * finally styles matched by the element's inline style declarations.
   *
   * @param object $element
   *   Node being processed. Its tag, class and attr['style'] are read.
   * @param array $state
   *   Conversion state. Reads $state['style_sheet'] (with optional 'default',
   *   'elements', 'classes' and 'inline' sections) and
   *   $state['current_style'].
   *
   * @return array
   *   The merged PHPWord style properties.
   */
  function _htmltodocx_get_style($element, $state) {
    $style_sheet = $state['style_sheet'];

    // Get the default styles (tolerating a style sheet without any):
    $phpword_style = isset($style_sheet['default']) ? $style_sheet['default'] : array();

    // Update with the current style, minus anything which is not inheritable:
    $current_style = $state['current_style'];
    $inheritable_props = htmltodocx_inheritable_props();
    foreach ($current_style as $property => $value) {
      if (!in_array($property, $inheritable_props)) {
        unset($current_style[$property]);
      }
    }
    $phpword_style = array_merge($phpword_style, $current_style);

    // Update with any styles defined by the element tag:
    $tag_style = isset($style_sheet['elements'][$element->tag]) ? $style_sheet['elements'][$element->tag] : array();
    $phpword_style = array_merge($phpword_style, $tag_style);

    // Find any classes defined for this element:
    $class_list = array();
    if (!empty($element->class)) {
      foreach (explode(' ', $element->class) as $class) {
        $class_list[] = trim($class);
      }
    }

    // Look for any style definitions for these classes. The style sheet order
    // (not the order in the class attribute) decides which class wins.
    $classes_style = array();
    if (!empty($class_list) && !empty($style_sheet['classes'])) {
      foreach ($style_sheet['classes'] as $class => $attributes) {
        if (in_array($class, $class_list)) {
          $classes_style = array_merge($classes_style, $attributes);
        }
      }
    }
    $phpword_style = array_merge($phpword_style, $classes_style);

    // Find any inline styles, normalised to "property: value" so they can be
    // compared with the keys of the style sheet's 'inline' section.
    $inline_style_list = array();
    if (!empty($element->attr['style'])) {
      $declarations = explode(';', rtrim(rtrim($element->attr['style']), ';'));
      foreach ($declarations as $declaration) {
        // Split on the first colon only, so values which themselves contain a
        // colon (e.g. url(http://...)) stay intact.
        $style_pair = explode(':', $declaration, 2);
        if (count($style_pair) < 2) {
          // Empty or malformed declaration (no colon): nothing to match.
          continue;
        }
        $inline_style_list[] = trim($style_pair[0]) . ': ' . trim($style_pair[1]);
      }
    }

    // Look for style definitions of these inline styles:
    $inline_styles = array();
    if (!empty($inline_style_list) && !empty($style_sheet['inline'])) {
      foreach ($style_sheet['inline'] as $inline_style => $attributes) {
        if (in_array($inline_style, $inline_style_list)) {
          $inline_styles = array_merge($inline_styles, $attributes);
        }
      }
    }

    return array_merge($phpword_style, $inline_styles);
  }
  /**
   * Names the PHPWord style properties which child elements inherit.
   *
   * Any property of the current style that is not in this list is dropped
   * before the style is passed down to a nested element.
   *
   * @return array
   *   List of inheritable PHPWord style property names.
   */
  function htmltodocx_inheritable_props() {
    // The list never changes, so build it once per request.
    static $inheritable = NULL;
    if ($inheritable === NULL) {
      $font_props = array('size', 'name', 'bold', 'italic', 'superScript', 'subScript', 'underline', 'color', 'fgColor');
      $paragraph_props = array('align', 'spacing', 'listType');
      $inheritable = array_merge($font_props, $paragraph_props);
    }
    return $inheritable;
  }
  /**
   * Entry point: converts SimpleHTMLDom nodes into a PHPWord element.
   *
   * Wrapper for htmltodocx_insert_html_recursive() which first fills in any
   * conversion settings the caller did not supply. Settings already present
   * in $state are kept, except 'pseudo_list' which is always switched on.
   *
   * @param object $phpword_element
   *   PHPWord object which receives the converted content.
   * @param mixed $html_dom_array
   *   SimpleHTMLDom nodes to convert.
   * @param array $state
   *   Conversion settings and working state; updated in place.
   */
  function htmltodocx_insert_html(&$phpword_element, $html_dom_array, &$state = array()) {
    // Lists: this converter only supports "pseudo" lists at present.
    $state['pseudo_list'] = TRUE;

    // Settings which simply fall back to a fixed value when absent.
    $fallbacks = array(
      // Bullet indicator font.
      'pseudo_list_indicator_font_name' => 'Wingdings',
      // Bullet indicator size.
      'pseudo_list_indicator_font_size' => '7',
      // Gives a circle bullet point with wingdings.
      'pseudo_list_indicator_character' => 'l ',
      // "Style sheet".
      'style_sheet' => array(),
      // Current style.
      'current_style' => array('size' => '11'),
      // Parents.
      'parents' => array(0 => 'body'),
      'list_depth' => 0,
      // Possible values - section, footer or header.
      'context' => 'section',
    );
    foreach ($fallbacks as $setting => $fallback) {
      if (!isset($state[$setting])) {
        $state[$setting] = $fallback;
      }
    }
    if (!isset($state['style_sheet']['default'])) {
      $state['style_sheet']['default'] = array();
    }

    // Tables: not possible inside a table cell, or when the caller has
    // explicitly switched them off.
    $inside_cell = in_array('td', $state['parents']) || in_array('th', $state['parents']);
    $switched_off = isset($state['table_allowed']) && !$state['table_allowed'];
    $state['table_allowed'] = !($inside_cell || $switched_off);

    // Headings option:
    if (!isset($state['structure_document'])) {
      $state['structure_document'] = FALSE;
    }
    if ($state['structure_document']) {
      $state['structure_headings'] = array('h1' => 1, 'h2' => 2, 'h3' => 3, 'h4' => 4, 'h5' => 5, 'h6' => 6);
    }
    // A table of contents is only meaningful for a structured document.
    if (!($state['structure_document'] && isset($state['table_of_contents_id']))) {
      $state['table_of_contents_id'] = FALSE;
    }

    // Treatment of divs: the default is to treat a div like a paragraph - that
    // is we insert a new line each time we encounter a new div.
    if (!isset($state['treat_div_as_paragraph'])) {
      $state['treat_div_as_paragraph'] = TRUE;
    }

    // Recurse through the HTML Dom inserting elements into the phpword object
    // as we go:
    htmltodocx_insert_html_recursive($phpword_element, $html_dom_array, $state);
  }
  /**
   * Populates a PHPWord element from SimpleHTMLDom nodes.
   *
   * This recursive function processes all the elements and child elements
   * from the DOM array of objects created by SimpleHTMLDom. The tag on top of
   * the $state['parents'] stack decides which child tags are converted
   * structurally; the content of any other tag is added as cleaned plain text.
   *
   * Besides the settings initialised by htmltodocx_insert_html(), the
   * following optional $state entries are read here: 'phpword_object' (the
   * PHPWord document, needed for heading and table styles), 'base_root' and
   * 'base_path' (used to resolve links and images) and 'download_img_path'
   * (directory prefix for downloaded remote images). The paths of downloaded
   * images are appended to $state['download_img_tmp'].
   *
   * @param object $phpword_element
   *   PHPWord object to add the converted html to.
   * @param array $html_dom_array
   *   Array of nodes generated by simple HTML dom.
   * @param array $state
   *   Parameters and working state for the current run; updated in place.
   */
  function htmltodocx_insert_html_recursive(&$phpword_element, $html_dom_array, &$state = array()) {
    // Go through the html_dom_array, adding bits to go in the PHPWord element.
    $allowed_children = htmltodocx_html_allowed_children($state['parents'][0]);

    // Both of these are optional in $state: read them once, guarded, so a
    // caller who did not supply them does not trigger undefined index errors.
    $phpword_object = (isset($state['phpword_object']) && is_object($state['phpword_object'])) ? $state['phpword_object'] : NULL;
    $base_root = isset($state['base_root']) ? $state['base_root'] : '';

    $heading_tags = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
    $list_state_keys = array('list_total_count', 'list_number', 'list_style_after', 'list_style_before');

    // Go through each element:
    foreach ($html_dom_array as $element) {
      $old_style = $state['current_style'];
      $state['current_style'] = _htmltodocx_get_style($element, $state);

      switch ($element->tag) {
        case 'p':
        case 'div': // Treat a div as a paragraph
        case 'h1':
        case 'h2':
        case 'h3':
        case 'h4':
        case 'h5':
        case 'h6':
          if ($state['structure_document'] && $phpword_object !== NULL && in_array($element->tag, $heading_tags)) {
            // If the structure_document option has been enabled, then headings
            // are used to create Word heading styles. Note, in this case, any
            // nested elements within the heading are displayed as text only.
            // Additionally we don't now add a text break after a heading where
            // sizeAfter has not been set.
            $phpword_object->addTitleStyle($state['structure_headings'][$element->tag], $state['current_style']);
            $phpword_element->addTitle(htmltodocx_clean_text($element->innertext), $state['structure_headings'][$element->tag]);
            break;
          }
          if ($element->tag == 'div' && $state['table_of_contents_id'] && $element->id == $state['table_of_contents_id']) {
            // Replace this div with a table of contents:
            $phpword_element->addTOC($state['current_style'], $state['current_style']);
            break;
          }
          // Everything in this element should be in the same text run
          // we need to initiate a text run here and pass it on. Starting one
          // of these elements will cause a new line to be added in the Word
          // document. In the case of divs this might not always be what is
          // wanted the setting 'treat_div_as_paragraph' determines whether or
          // not to add new lines for divs.
          if ($element->tag != 'div' || $state['treat_div_as_paragraph'] || !isset($state['textrun'])) {
            $state['textrun'] = $phpword_element->createTextRun($state['current_style']);
          }
          // For better usability for the end user of the Word document, we
          // separate paragraphs and headings with an empty line. You can
          // override this behaviour by setting the spaceAfter parameter for
          // the current element.
          // If the spaceAfter parameter is not set, we set it temporarily to 0
          // here and record that it wasn't set in the style. Later we will add
          // an empty line. Word 2007 and later have a non-zero default for
          // paragraph separation, so without setting that spacing to 0 here we
          // would end up with a large gap between paragraphs (the document
          // template default plus the extra line).
          $space_after_set = TRUE;
          if (!isset($state['current_style']['spaceAfter'])) {
            $state['current_style']['spaceAfter'] = 0;
            $space_after_set = FALSE;
          }
          if (in_array($element->tag, $allowed_children)) {
            array_unshift($state['parents'], $element->tag);
            htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
            array_shift($state['parents']);
          }
          else {
            $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
          }
          unset($state['textrun']);
          if (!$space_after_set) {
            // Add the text break here - where the spaceAfter parameter hadn't
            // been set initially - also unset the spaceAfter parameter we just
            // set:
            $phpword_element->addTextBreak();
            unset($state['current_style']['spaceAfter']);
          }
          break;

        case 'table':
          if (in_array('table', $allowed_children)) {
            $old_table_state = $state['table_allowed'];
            if (!$state['table_allowed'] || in_array('td', $state['parents']) || in_array('th', $state['parents'])) {
              $state['table_allowed'] = FALSE; // This is a PHPWord constraint
            }
            else {
              $state['table_allowed'] = TRUE;
              // PHPWord allows table_styles to be passed in a couple of
              // different ways either using an array of properties, or by
              // defining a full table style on the PHPWord object:
              if ($phpword_object !== NULL && method_exists($phpword_object, 'addTableStyle')) {
                $phpword_object->addTableStyle('temp_table_style', $state['current_style']);
                $table_style = 'temp_table_style';
              }
              else {
                $table_style = $state['current_style'];
              }
              $state['table'] = $phpword_element->addTable($table_style);
            }
            array_unshift($state['parents'], 'table');
            htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
            array_shift($state['parents']);
            // Reset table state to what it was before a table was added:
            $state['table_allowed'] = $old_table_state;
            $phpword_element->addTextBreak();
          }
          else {
            $state['textrun'] = $phpword_element->createTextRun();
            $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
          }
          break;

        case 'tbody':
          if (in_array('tbody', $allowed_children)) {
            array_unshift($state['parents'], 'tbody');
            htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
            array_shift($state['parents']);
          }
          else {
            $state['textrun'] = $phpword_element->createTextRun();
            $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
          }
          break;

        case 'tr':
          if (in_array('tr', $allowed_children)) {
            if ($state['table_allowed']) {
              $state['table']->addRow();
            }
            else {
              // Simply add a new line if a table is not possible in this
              // context:
              $state['textrun'] = $phpword_element->createTextRun();
            }
            array_unshift($state['parents'], 'tr');
            htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
            array_shift($state['parents']);
          }
          else {
            $state['textrun'] = $phpword_element->createTextRun();
            $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
          }
          break;

        case 'td':
        case 'th':
          if (in_array($element->tag, $allowed_children) && $state['table_allowed']) {
            unset($state['textrun']);
            if (isset($state['current_style']['width'])) {
              $cell_width = $state['current_style']['width'];
            }
            elseif (isset($element->width)) {
              // Converting at 15 TWIPS per pixel.
              $cell_width = $element->width * 15;
            }
            else {
              $cell_width = 800;
            }
            $state['table_cell'] = $state['table']->addCell($cell_width, $state['current_style']);
            array_unshift($state['parents'], $element->tag);
            htmltodocx_insert_html_recursive($state['table_cell'], $element->nodes, $state);
            array_shift($state['parents']);
          }
          else {
            if (!isset($state['textrun'])) {
              $state['textrun'] = $phpword_element->createTextRun();
            }
            $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
          }
          break;

        case 'a':
          // Create a new text run if we aren't in one already:
          if (!isset($state['textrun'])) {
            $state['textrun'] = $phpword_element->createTextRun();
          }
          if ($state['context'] == 'section') {
            // Site-relative links (leading slash) are prefixed with the base
            // root; absolute and document-relative links are used as given.
            $href = $element->href;
            if (strpos($href, '/') === 0) {
              $href = $base_root . $href;
            }
            // Replace any spaces in url with %20 - to prevent errors in the
            // Word document:
            $state['textrun']->addLink(htmltodocx_url_encode_chars($href), htmltodocx_clean_text($element->innertext), $state['current_style']);
          }
          else {
            // Links can't seem to be included in headers or footers with
            // PHPWord: trying to include them causes an error which stops Word
            // from opening the file - in Word 2003 with the converter at
            // least. So add the link styled as a link only.
            $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
          }
          break;

        case 'ul':
        case 'ol':
          // A list nested inside another list must not clobber the outer
          // list's counters and spacing styles: remember them here and put
          // them back once the nested list has been processed, so that the
          // outer list carries on numbering and still recognises its last
          // item.
          $outer_list_state = NULL;
          if (in_array('ul', $state['parents']) || in_array('ol', $state['parents'])) {
            $outer_list_state = array();
            foreach ($list_state_keys as $list_key) {
              if (isset($state[$list_key])) {
                $outer_list_state[$list_key] = $state[$list_key];
              }
            }
          }
          // We use this to be able to add the list spaceAfter onto the last
          // list element. All list children should be li elements.
          $state['list_total_count'] = count($element->children);
          _htmltodocx_add_list_start_end_spacing_style($state);
          $state['list_number'] = 0; // Reset list number.
          if (in_array($element->tag, $allowed_children)) {
            if (!isset($state['pseudo_list'])) {
              // Unset any existing text run:
              // PHPWord lists cannot appear in a text run. If we leave a text
              // run active then subsequent text will go in that text run (if
              // it isn't re-initialised), which would mean that text after
              // this list would appear before it in the Word document.
              unset($state['textrun']);
            }
            array_unshift($state['parents'], $element->tag);
            htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
            array_shift($state['parents']);
          }
          else {
            $state['textrun'] = $phpword_element->createTextRun();
            $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
          }
          if ($outer_list_state !== NULL) {
            foreach ($outer_list_state as $list_key => $list_value) {
              $state[$list_key] = $list_value;
            }
          }
          break;

        case 'li':
          // You cannot style individual pieces of text in a list element so we
          // do it with text runs instead. This does not allow us to indent
          // lists at all, so we can't show nesting.
          // Before and after spacings:
          if ($state['list_number'] === 0) {
            $state['current_style'] = array_merge($state['current_style'], $state['list_style_before']);
          }
          $last_item = FALSE;
          if ($state['list_number'] == $state['list_total_count'] - 1) {
            $last_item = TRUE;
            if (empty($state['list_style_after'])) {
              // Set to 0 if not defined so we can add a text break without
              // ending up within too much space in Word2007+.
              // *Needs further testing on Word 2007+*
              $state['current_style']['spaceAfter'] = 0;
            }
            $state['current_style'] = array_merge($state['current_style'], $state['list_style_after']);
          }
          // We create a new text run for each element:
          $state['textrun'] = $phpword_element->createTextRun($state['current_style']);
          if (in_array('li', $allowed_children)) {
            $state['list_number']++;
            $style = $state['current_style'];
            if ($state['parents'][0] == 'ol') {
              $item_indicator = $state['list_number'] . '. ';
            }
            else {
              // Bullets are drawn with a character from the indicator font.
              $style['name'] = $state['pseudo_list_indicator_font_name'];
              $style['size'] = $state['pseudo_list_indicator_font_size'];
              $item_indicator = $state['pseudo_list_indicator_character'];
            }
            array_unshift($state['parents'], 'li');
            $state['textrun']->addText($item_indicator, $style);
            htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
            array_shift($state['parents']);
          }
          else {
            $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
          }
          if ($last_item && empty($state['list_style_after'])) {
            // Add an empty line after the list if no spacing after has been
            // defined.
            $phpword_element->addTextBreak();
          }
          unset($state['textrun']);
          break;

        case 'text':
          // We may get some empty text nodes - containing just a space - in
          // simple HTML dom - we want to exclude those, as these can cause
          // extra line returns. However we don't want to exclude spaces
          // between styling elements (these will be within a text run).
          if (!isset($state['textrun'])) {
            $text = htmltodocx_clean_text(trim($element->innertext));
          }
          else {
            $text = htmltodocx_clean_text($element->innertext);
          }
          if (!empty($text)) {
            if (!isset($state['textrun'])) {
              $state['textrun'] = $phpword_element->createTextRun();
            }
            $state['textrun']->addText($text, $state['current_style']);
          }
          break;

        // Style tags:
        case 'strong':
        case 'b':
        case 'sup': // Not working in PHPWord
        case 'em':
        case 'i':
        case 'u':
        case 'span':
        case 'code':
          // Create a new text run if we aren't in one already:
          if (!isset($state['textrun'])) {
            $state['textrun'] = $phpword_element->createTextRun();
          }
          if (in_array($element->tag, $allowed_children)) {
            array_unshift($state['parents'], $element->tag);
            htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
            array_shift($state['parents']);
          }
          else {
            $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
          }
          break;

        // NB, Simple HTML Dom might not be picking up <br> tags.
        case 'br':
          // Simply create a new text run:
          $state['textrun'] = $phpword_element->createTextRun();
          break;

        case 'img':
          if ($element->height && $element->width) {
            $state['current_style']['height'] = $element->height;
            $state['current_style']['width'] = $element->width;
          }
          $src = _htmltodocx_image_source($element, $state);
          if ($src !== FALSE) {
            $phpword_element->addImage($src, $state['current_style']);
          }
          // Otherwise a remote image could not be fetched: leave it out
          // rather than hand PHPWord an empty file.
          break;

        default:
          $state['textrun'] = $phpword_element->createTextRun();
          $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
          break;
      }

      // Reset the style back to what it was:
      $state['current_style'] = $old_style;
    }
  }

  /**
   * Works out the local file path to use for an img element.
   *
   * Images on this server (src starting with $state['base_root'], or a
   * relative src) are mapped to a path below the document root. Images from
   * another site (http:// or https://) are downloaded to
   * $state['download_img_path'] and the downloaded file is recorded in
   * $state['download_img_tmp'].
   *
   * @param object $element
   *   The img node; its src attribute is read.
   * @param array $state
   *   Conversion state; 'download_img_tmp' is appended to for remote images.
   *
   * @return string|false
   *   Path of the image file, or FALSE when a remote image could not be
   *   downloaded or stored.
   */
  function _htmltodocx_image_source($element, &$state) {
    $base_root = isset($state['base_root']) ? $state['base_root'] : '';
    $element_src = $element->src;

    if ($base_root !== '' && strpos($element_src, $base_root) === 0) {
      // The image source is a full url, but nevertheless it is on this
      // server.
      $element_src = substr($element_src, strlen($base_root));
    }

    if (preg_match('#^https?://#i', $element_src)) {
      // The image url is from another site: fetch a local copy.
      $img_content = file_get_contents($element_src);
      if ($img_content === FALSE) {
        return FALSE;
      }
      $tmp_path = isset($state['download_img_path']) ? $state['download_img_path'] : '';
      $tmp_file = $tmp_path . basename($element_src);
      if (file_put_contents($tmp_file, $img_content) === FALSE) {
        return FALSE;
      }
      $state['download_img_tmp'][] = $tmp_file;
      return $tmp_file;
    }

    if (strpos($element_src, '/') === 0) {
      return htmltodocx_doc_root() . $element_src;
    }

    $base_path = isset($state['base_path']) ? $state['base_path'] : '';
    return htmltodocx_doc_root() . $base_path . $element_src;
  }
  /**
   * Records the before/after spacing of a list for its first and last items.
   *
   * Copies 'spaceAfter' and 'spaceBefore' from $state['current_style'] (when
   * set) into $state['list_style_after'] and $state['list_style_before'];
   * each is an empty array when the corresponding property is not set.
   *
   * @param array $state
   *   Conversion state; updated in place.
   */
  function _htmltodocx_add_list_start_end_spacing_style(&$state) {
    $spacing_props = array(
      'list_style_after' => 'spaceAfter',
      'list_style_before' => 'spaceBefore',
    );
    foreach ($spacing_props as $state_key => $property) {
      $spacing = array();
      if (isset($state['current_style'][$property])) {
        $spacing[$property] = $state['current_style'][$property];
      }
      $state[$state_key] = $spacing;
    }
  }
  /**
   * Gets the document root.
   *
   * Takes the current working directory and removes from its end as many
   * characters as there are in the directory part of the SCRIPT_NAME
   * environment variable.
   * NOTE(review): this presumes the working directory is the directory of the
   * running script - confirm for your server set up.
   *
   * @return string
   *   The derived document root path.
   */
  function htmltodocx_doc_root() {
    $working_dir = realpath('');
    // Should be available on both Apache and non Apache servers.
    $script_name = getenv("SCRIPT_NAME");
    // Directory part of the script name: everything before the last slash.
    $script_dir = substr($script_name, 0, strrpos($script_name, '/'));
    return empty($script_dir) ? $working_dir : substr($working_dir, 0, -strlen($script_dir));
  }
  /**
   * Encodes selected characters in a url to prevent errors in the created Word
   * document.
   *
   * Note: if there is a space in the url and there isn't a forward slash
   * preceding it at some point, the resulting Word document will be corrupted
   * (even where the space has been urlencoded). We convert spaces to %20 which
   * stops this corruption in circumstances where a forward slash is present.
   *
   * @param string $url
   *   The url to encode.
   *
   * @return string
   *   The url with each listed character replaced by its percent-encoding.
   */
  function htmltodocx_url_encode_chars($url) {
    // List the characters to be encoded here:
    $targets = array(' ');
    $replacements = array_map('rawurlencode', $targets);
    return str_replace($targets, $replacements, $url);
  }