array('p', 'ul', 'ol', 'table', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'), 'h1' => array('a', 'em', 'i', 'strong', 'b', 'br', 'span', 'code', 'u', 'sup', 'text'), 'h2' => array('a', 'em', 'i', 'strong', 'b', 'br', 'span', 'code', 'u', 'sup', 'text'), 'h3' => array('a', 'em', 'i', 'strong', 'b', 'br', 'span', 'code', 'u', 'sup', 'text'), 'h4' => array('a', 'em', 'i', 'strong', 'b', 'br', 'span', 'code', 'u', 'sup', 'text'), 'h5' => array('a', 'em', 'i', 'strong', 'b', 'br', 'span', 'code', 'u', 'sup', 'text'), 'h6' => array('a', 'em', 'i', 'strong', 'b', 'br', 'span', 'code', 'u', 'sup', 'text'), 'p' => array('a', 'em', 'i', 'strong', 'b', 'ul', 'ol', 'img', 'table', 'br', 'span', 'code', 'u', 'sup', 'text', 'div', 'p'), // p does not nest - simple_html_dom will create a flat set of paragraphs if it finds nested ones. 'div' => array('a', 'em', 'i', 'strong', 'b', 'ul', 'ol', 'img', 'table', 'br', 'span', 'code', 'u', 'sup', 'text', 'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'), 'a' => array('text'), // PHPWord doesn't allow elements to be placed in link elements 'em' => array('a', 'strong', 'b', 'br', 'span', 'code', 'u', 'sup', 'text'), // Italic 'i' => array('a', 'strong', 'b', 'br', 'span', 'code', 'u', 'sup', 'text'), // Italic 'strong' => array('a', 'em', 'i', 'br', 'span', 'code', 'u', 'sup', 'text'), // Bold 'b' => array('a', 'em', 'i', 'br', 'span', 'code', 'u', 'sup', 'text'), // Bold 'sup' => array('a', 'em', 'i', 'br', 'span', 'code', 'u', 'text'), // Superscript 'u' => array('a', 'em', 'strong', 'b', 'i', 'br', 'span', 'code', 'sup', 'text'), // Underline - deprecated - but could be encountered. 
'ul' => array('li'),
    'ol' => array('li'),
    'li' => array('a', 'em', 'i', 'strong', 'b', 'ul', 'ol', 'img', 'br', 'span', 'code', 'u', 'sup', 'text'),
    'img' => array(),
    'table' => array('tbody', 'tr'),
    'tbody' => array('tr'),
    'tr' => array('td', 'th'),
    // PHPWord does not allow you to insert a table into a table cell.
    'td' => array('p', 'a', 'em', 'i', 'strong', 'b', 'ul', 'ol', 'img', 'br', 'span', 'code', 'u', 'sup', 'text', 'table'),
    // PHPWord does not allow you to insert a table into a table cell.
    'th' => array('p', 'a', 'em', 'i', 'strong', 'b', 'ul', 'ol', 'img', 'br', 'span', 'code', 'u', 'sup', 'text', 'table'),
    'br' => array(),
    // Note, elements nested inside the code element do not work! (Perhaps
    // simpleHTMLDom isn't recognising them).
    'code' => array(),
    // Used for styles - underline.
    'span' => array('a', 'em', 'i', 'strong', 'b', 'img', 'br', 'span', 'code', 'sup', 'text'),
    // The tag name used for elements containing just text in SimpleHtmlDom.
    'text' => array(),
  );

  if (!$tag) {
    // No tag requested: hand back the complete map.
    return $allowed_children;
  }
  elseif (isset($allowed_children[$tag])) {
    return $allowed_children[$tag];
  }
  else {
    // Unknown tags are not allowed any children.
    return array();
  }
}

/**
 * Clean up text: drop markup, collapse whitespace and decode entities.
 *
 * @param string $text
 *   Raw inner HTML of an element.
 *
 * @return string
 *   Plain text with tags removed, every run of whitespace reduced to a
 *   single space and HTML entities decoded as UTF-8.
 */
function htmltodocx_clean_text($text) {

  // Replace each &nbsp; with a single space. This has to happen before the
  // entities are decoded below, otherwise the entity would be turned into a
  // non-breaking space character after the whitespace has already been
  // collapsed. A literal U+00A0 is treated the same way.
  $text = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $text);

  if (strpos($text, '<') !== FALSE) {
    // We only run strip_tags if it looks like there might be some tags in the
    // text as strip_tags is expensive:
    $text = strip_tags($text);
  }

  // Strip out extra spaces. In unicode mode preg_replace() returns NULL when
  // the subject is not valid UTF-8, which would silently discard the whole
  // text; fall back to a byte-wise collapse in that case.
  $collapsed = preg_replace('/\s+/u', ' ', $text);
  if ($collapsed === NULL) {
    $collapsed = preg_replace('/\s+/', ' ', $text);
  }

  // Convert entities:
  return html_entity_decode($collapsed, ENT_COMPAT, 'UTF-8');
}

/**
 * Compute the styles that should be applied for the
 * current element.
 * We start with the default style, and successively override
 * this with the current style, style set for the tag, classes
 * and inline styles.
*
 * @param object $element
 *   DOM node; its tag, class and attr['style'] members are read.
 * @param array $state
 *   Conversion state; 'style_sheet' and 'current_style' are read.
 *
 * @return array
 *   The merged PHPWord style array for the element.
 */
function _htmltodocx_get_style($element, $state) {

  $style_sheet = $state['style_sheet'];

  // Get the default styles:
  $phpword_style = isset($style_sheet['default']) ? $style_sheet['default'] : array();

  // Update with the current style, keeping only the inheritable properties:
  $inherited_style = array_intersect_key(
    $state['current_style'],
    array_flip(htmltodocx_inheritable_props())
  );
  $phpword_style = array_merge($phpword_style, $inherited_style);

  // Update with any styles defined by the element tag:
  if (isset($style_sheet['elements'][$element->tag])) {
    $phpword_style = array_merge($phpword_style, $style_sheet['elements'][$element->tag]);
  }

  // Update with the styles of any classes set on this element. Classes are
  // applied in the order in which the style sheet defines them.
  if (!empty($element->class) && !empty($style_sheet['classes'])) {
    // Split on any run of whitespace so tabs and double spaces are handled.
    $class_list = preg_split('/\s+/', trim($element->class), -1, PREG_SPLIT_NO_EMPTY);
    foreach ($style_sheet['classes'] as $class => $attributes) {
      // Numeric class names become integer array keys, hence the cast.
      if (in_array((string) $class, $class_list, TRUE)) {
        $phpword_style = array_merge($phpword_style, $attributes);
      }
    }
  }

  // Update with any inline styles. Each declaration is normalised to
  // "property: value" and looked up in the 'inline' part of the style sheet.
  if (!empty($element->attr['style']) && !empty($style_sheet['inline'])) {
    $inline_style_list = array();
    foreach (explode(';', $element->attr['style']) as $declaration) {
      // Limit the split to two parts so values which themselves contain a
      // colon (for example url(http://...)) stay intact.
      $style_pair = explode(':', $declaration, 2);
      if (count($style_pair) < 2) {
        // Empty or malformed declaration (no colon) - nothing to look up.
        continue;
      }
      $inline_style_list[] = trim($style_pair[0]) . ': ' . trim($style_pair[1]);
    }
    foreach ($style_sheet['inline'] as $inline_style => $attributes) {
      if (in_array((string) $inline_style, $inline_style_list, TRUE)) {
        $phpword_style = array_merge($phpword_style, $attributes);
      }
    }
  }

  return $phpword_style;
}

/**
 * PHPWord style properties which are inheritable for the purposes of our
 * conversion.
 *
 * @return array
 *   List of property names that a child element takes over from the current
 *   style; every other property is dropped by _htmltodocx_get_style().
 */
function htmltodocx_inheritable_props() {
  return array(
    'size',
    'name',
    'bold',
    'italic',
    'superScript',
    'subScript',
    'underline',
    'color',
    'fgColor',
    'align',
    'spacing',
    'listType',
  );
}

/**
 * Wrapper for htmltodocx_insert_html_recursive()
 * - inserts the initial defaults.
 *
 * @param object $phpword_element
 *   PHPWord object
 * @param mixed $html_dom_array
 *   SimpleHTMLDom object
 * @param array $state
 *   State. Passed by reference: missing settings are filled in with their
 *   defaults and remain visible to the caller afterwards.
 */
function htmltodocx_insert_html(&$phpword_element, $html_dom_array, &$state = array()) {

  // This converter only supports "pseudo" lists at present, so this setting
  // is always forced on.
  $state['pseudo_list'] = TRUE;

  // Settings which keep the caller's value when one has been supplied:
  $defaults = array(
    // Bullet indicator font, size and character ('l ' gives a circle bullet
    // point with Wingdings):
    'pseudo_list_indicator_font_name' => 'Wingdings',
    'pseudo_list_indicator_font_size' => '7',
    'pseudo_list_indicator_character' => 'l ',
    // "Style sheet":
    'style_sheet' => array(),
    // Current style:
    'current_style' => array('size' => '11'),
    // Parents:
    'parents' => array(0 => 'body'),
    'list_depth' => 0,
    // Possible values - section, footer or header.
    'context' => 'section',
    // Headings option:
    'structure_document' => FALSE,
    // Treatment of divs: the default is to treat a div like a paragraph -
    // that is we insert a new line each time we encounter a new div.
    'treat_div_as_paragraph' => TRUE,
    // Optional settings read by htmltodocx_insert_html_recursive(); giving
    // them a value here avoids undefined index notices there.
    'phpword_object' => NULL,
    'base_root' => '',
    'base_path' => '',
  );
  foreach ($defaults as $key => $value) {
    if (!isset($state[$key])) {
      $state[$key] = $value;
    }
  }
  if (!isset($state['style_sheet']['default'])) {
    $state['style_sheet']['default'] = array();
  }

  // Tables: not allowed when we start inside a table cell, or when the caller
  // has explicitly switched them off.
  $inside_cell = in_array('td', $state['parents'], TRUE) || in_array('th', $state['parents'], TRUE);
  $switched_off = isset($state['table_allowed']) && !$state['table_allowed'];
  $state['table_allowed'] = !($inside_cell || $switched_off);

  // Headings option:
  if ($state['structure_document']) {
    $state['structure_headings'] = array('h1' => 1, 'h2' => 2, 'h3' => 3, 'h4' => 4, 'h5' => 5, 'h6' => 6);
  }
  if (!$state['structure_document'] || !isset($state['table_of_contents_id'])) {
    $state['table_of_contents_id'] = FALSE;
  }

  // Recurse through the HTML Dom inserting elements into the phpword object
  // as we go:
  htmltodocx_insert_html_recursive($phpword_element, $html_dom_array, $state);
}

/**
 * Populate PHPWord element
 * This recursive function processes all the elements and child elements
 * from the DOM array of objects created by SimpleHTMLDom.
*
 * @param object $phpword_element
 *   PHPWord object to add in the converted html
 * @param array $html_dom_array
 *   Array of nodes generated by simple HTML dom
 * @param array $state
 *   Parameters for the current run. Passed by reference and updated while
 *   the tree is walked (parents stack, current style, text run, table and
 *   list bookkeeping). The defaults are expected to have been filled in by
 *   htmltodocx_insert_html().
 */
function htmltodocx_insert_html_recursive(&$phpword_element, $html_dom_array, &$state = array()) {

  // The tag at the front of the parents stack decides which child tags are
  // converted structurally; anything else is flattened to plain text.
  $allowed_children = htmltodocx_html_allowed_children($state['parents'][0]);

  // Optional settings which the caller may not have supplied:
  $phpword_object = (isset($state['phpword_object']) && is_object($state['phpword_object'])) ? $state['phpword_object'] : NULL;
  $base_root = isset($state['base_root']) ? (string) $state['base_root'] : '';
  $base_path = isset($state['base_path']) ? (string) $state['base_path'] : '';

  // Go through each element:
  foreach ($html_dom_array as $element) {

    $old_style = $state['current_style'];
    $state['current_style'] = _htmltodocx_get_style($element, $state);

    switch ($element->tag) {

      case 'p':
      case 'div': // Treat a div as a paragraph
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
        if ($state['structure_document'] && in_array($element->tag, array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'), TRUE) && $phpword_object) {
          // If the structure_document option has been enabled, then headings
          // are used to create Word heading styles. Note, in this case, any
          // nested elements within the heading are displayed as text only.
          // Additionally we don't now add a text break after a heading where
          // sizeAfter has not been set.
          $phpword_object->addTitleStyle($state['structure_headings'][$element->tag], $state['current_style']);
          $phpword_element->addTitle(htmltodocx_clean_text($element->innertext), $state['structure_headings'][$element->tag]);
          break;
        }

        if ($element->tag == 'div' && $state['table_of_contents_id'] && $element->id == $state['table_of_contents_id']) {
          // Replace this div with a table of contents:
          $phpword_element->addTOC($state['current_style'], $state['current_style']);
          break;
        }

        // Everything in this element should be in the same text run we need
        // to initiate a text run here and pass it on. Starting one of these
        // elements will cause a new line to be added in the Word document.
        // In the case of divs this might not always be what is wanted: the
        // setting 'treat_div_as_paragraph' determines whether or not to add
        // new lines for divs.
        if ($element->tag != 'div' || $state['treat_div_as_paragraph'] || !isset($state['textrun'])) {
          $state['textrun'] = $phpword_element->createTextRun($state['current_style']);
        }

        // For better usability for the end user of the Word document, we
        // separate paragraphs and headings with an empty line. You can
        // override this behaviour by setting the spaceAfter parameter for
        // the current element.
        // If the spaceAfter parameter is not set, we set it temporarily to 0
        // here and record that it wasn't set in the style. Later we will add
        // an empty line. Word 2007 and later have a non-zero default for
        // paragraph separation, so without setting that spacing to 0 here we
        // would end up with a large gap between paragraphs (the document
        // template default plus the extra line).
        // NOTE(review): the text run above has already been created from the
        // style at this point, so the temporary 0 only reaches the addText()
        // fallback below - confirm whether it was meant to be set earlier.
        $space_after_set = TRUE;
        if (!isset($state['current_style']['spaceAfter'])) {
          $state['current_style']['spaceAfter'] = 0;
          $space_after_set = FALSE;
        }

        if (in_array($element->tag, $allowed_children, TRUE)) {
          array_unshift($state['parents'], $element->tag);
          htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
          array_shift($state['parents']);
        }
        else {
          $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
        }

        unset($state['textrun']);

        if (!$space_after_set) {
          // Add the text break here - where the spaceAfter parameter hadn't
          // been set initially - also unset the spaceAfter parameter we just
          // set:
          $phpword_element->addTextBreak();
          unset($state['current_style']['spaceAfter']);
        }
        break;

      case 'table':
        if (in_array('table', $allowed_children, TRUE)) {
          $old_table_state = $state['table_allowed'];
          if (!$state['table_allowed'] || in_array('td', $state['parents'], TRUE) || in_array('th', $state['parents'], TRUE)) {
            $state['table_allowed'] = FALSE; // This is a PHPWord constraint
          }
          else {
            $state['table_allowed'] = TRUE;
            // PHPWord allows table_styles to be passed in a couple of
            // different ways either using an array of properties, or by
            // defining a full table style on the PHPWord object:
            if ($phpword_object && method_exists($phpword_object, 'addTableStyle')) {
              $phpword_object->addTableStyle('temp_table_style', $state['current_style']);
              $table_style = 'temp_table_style';
            }
            else {
              $table_style = $state['current_style'];
            }
            $state['table'] = $phpword_element->addTable($table_style);
          }
          array_unshift($state['parents'], 'table');
          htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
          array_shift($state['parents']);
          // Reset table state to what it was before a table was added:
          $state['table_allowed'] = $old_table_state;
          $phpword_element->addTextBreak();
        }
        else {
          $state['textrun'] = $phpword_element->createTextRun();
          $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
        }
        break;

      case 'tbody':
        if (in_array('tbody', $allowed_children, TRUE)) {
          array_unshift($state['parents'], 'tbody');
          htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
          array_shift($state['parents']);
        }
        else {
          $state['textrun'] = $phpword_element->createTextRun();
          $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
        }
        break;

      case 'tr':
        if (in_array('tr', $allowed_children, TRUE)) {
          if ($state['table_allowed']) {
            $state['table']->addRow();
          }
          else {
            // Simply add a new line if a table is not possible in this
            // context:
            $state['textrun'] = $phpword_element->createTextRun();
          }
          array_unshift($state['parents'], 'tr');
          htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
          array_shift($state['parents']);
        }
        else {
          $state['textrun'] = $phpword_element->createTextRun();
          $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
        }
        break;

      case 'td':
      case 'th':
        if (in_array($element->tag, $allowed_children, TRUE) && $state['table_allowed']) {
          unset($state['textrun']);
          if (isset($state['current_style']['width'])) {
            $cell_width = $state['current_style']['width'];
          }
          elseif (isset($element->width)) {
            // Converting at 15 TWIPS per pixel. The attribute may carry a
            // unit ("120px", "50%"), so only its leading number is used.
            $cell_width = (int) round((float) $element->width * 15);
          }
          else {
            $cell_width = 800;
          }
          $state['table_cell'] = $state['table']->addCell($cell_width, $state['current_style']);
          array_unshift($state['parents'], $element->tag);
          htmltodocx_insert_html_recursive($state['table_cell'], $element->nodes, $state);
          array_shift($state['parents']);
        }
        else {
          if (!isset($state['textrun'])) {
            $state['textrun'] = $phpword_element->createTextRun();
          }
          $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
        }
        break;

      case 'a':
        // Create a new text run if we aren't in one already:
        if (!isset($state['textrun'])) {
          $state['textrun'] = $phpword_element->createTextRun();
        }
        if ($state['context'] == 'section') {
          $href = (string) $element->href;
          // Root-relative links are completed with the site root. Absolute
          // urls, protocol-relative urls ("//host/...") and relative paths
          // are passed through unchanged.
          if (strpos($href, '/') === 0 && strpos($href, '//') !== 0) {
            $href = $base_root . $href;
          }
          // Replace any spaces in url with %20 - to prevent errors in the
          // Word document:
          $state['textrun']->addLink(htmltodocx_url_encode_chars($href), htmltodocx_clean_text($element->innertext), $state['current_style']);
        }
        else {
          // Links can't seem to be included in headers or footers with
          // PHPWord: trying to include them causes an error which stops Word
          // from opening the file - in Word 2003 with the converter at least.
          // So add the link styled as a link only.
          $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
        }
        break;

      case 'ul':
      case 'ol':
        // Remember the bookkeeping of an enclosing list (if any) so that it
        // can be put back once this list is finished. Without this a nested
        // list would leave its own counters behind and the numbering and
        // last-item spacing of the outer list would be wrong.
        $list_keys = array('list_total_count', 'list_number', 'list_style_before', 'list_style_after');
        $outer_list_state = array_intersect_key($state, array_flip($list_keys));

        // We use this to be able to add the list spaceAfter onto the last
        // list element. All list children should be li elements.
        $state['list_total_count'] = count($element->children);
        _htmltodocx_add_list_start_end_spacing_style($state);
        $state['list_number'] = 0; // Reset list number.

        if (in_array($element->tag, $allowed_children, TRUE)) {
          if (!isset($state['pseudo_list'])) {
            // Unset any existing text run:
            unset($state['textrun']);
            // PHPWord lists cannot appear in a text run. If we leave a text
            // run active then subsequent text will go in that text run (if it
            // isn't re-initialised), which would mean that text after this
            // list would appear before it in the Word document.
          }
          array_unshift($state['parents'], $element->tag);
          htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
          array_shift($state['parents']);
        }
        else {
          $state['textrun'] = $phpword_element->createTextRun();
          $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
        }

        // Put the enclosing list's bookkeeping back:
        foreach ($list_keys as $list_key) {
          if (array_key_exists($list_key, $outer_list_state)) {
            $state[$list_key] = $outer_list_state[$list_key];
          }
          else {
            unset($state[$list_key]);
          }
        }
        break;

      case 'li':
        // You cannot style individual pieces of text in a list element so we
        // do it with text runs instead. This does not allow us to indent
        // lists at all, so we can't show nesting.

        // An li met outside of a ul/ol has no list bookkeeping yet; supply
        // neutral values so the checks below don't read undefined entries.
        $state += array(
          'list_number' => 0,
          'list_total_count' => 0,
          'list_style_before' => array(),
          'list_style_after' => array(),
        );

        // Before and after spacings:
        if ($state['list_number'] === 0) {
          $state['current_style'] = array_merge($state['current_style'], $state['list_style_before']);
        }
        $last_item = FALSE;
        if ($state['list_number'] == $state['list_total_count'] - 1) {
          $last_item = TRUE;
          if (empty($state['list_style_after'])) {
            // Set to 0 if not defined so we can add a text break without
            // ending up within too much space in Word2007+.
            // *Needs further testing on Word 2007+*
            $state['current_style']['spaceAfter'] = 0;
          }
          $state['current_style'] = array_merge($state['current_style'], $state['list_style_after']);
        }

        // We create a new text run for each element:
        $state['textrun'] = $phpword_element->createTextRun($state['current_style']);

        if (in_array('li', $allowed_children, TRUE)) {
          $state['list_number']++;
          $style = $state['current_style'];
          if ($state['parents'][0] == 'ol') {
            $item_indicator = $state['list_number'] . '. ';
          }
          else {
            $style['name'] = $state['pseudo_list_indicator_font_name'];
            $style['size'] = $state['pseudo_list_indicator_font_size'];
            $item_indicator = $state['pseudo_list_indicator_character'];
          }
          array_unshift($state['parents'], 'li');
          $state['textrun']->addText($item_indicator, $style);
          htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
          array_shift($state['parents']);
        }
        else {
          $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
        }

        if ($last_item && empty($state['list_style_after'])) {
          // Add an empty line after the list if no spacing after has been
          // defined.
          $phpword_element->addTextBreak();
        }
        unset($state['textrun']);
        break;

      case 'text':
        // We may get some empty text nodes - containing just a space - in
        // simple HTML dom - we want to exclude those, as these can cause
        // extra line returns. However we don't want to exclude spaces between
        // styling elements (these will be within a text run).
        if (!isset($state['textrun'])) {
          $text = htmltodocx_clean_text(trim($element->innertext));
        }
        else {
          $text = htmltodocx_clean_text($element->innertext);
        }
        // Compare against the empty string rather than using empty(), which
        // would also throw away a text node consisting of "0".
        if ((string) $text !== '') {
          if (!isset($state['textrun'])) {
            $state['textrun'] = $phpword_element->createTextRun();
          }
          $state['textrun']->addText($text, $state['current_style']);
        }
        break;

      // Style tags:
      case 'strong':
      case 'b':
      case 'sup': // Not working in PHPWord
      case 'em':
      case 'i':
      case 'u':
      case 'span':
      case 'code':
        // Create a new text run if we aren't in one already:
        if (!isset($state['textrun'])) {
          $state['textrun'] = $phpword_element->createTextRun();
        }
        if (in_array($element->tag, $allowed_children, TRUE)) {
          array_unshift($state['parents'], $element->tag);
          htmltodocx_insert_html_recursive($phpword_element, $element->nodes, $state);
          array_shift($state['parents']);
        }
        else {
          $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
        }
        break;

      // NB, Simple HTML Dom might not be picking up <br> tags.
      case 'br':
        // Simply create a new text run:
        $state['textrun'] = $phpword_element->createTextRun();
        break;

      case 'img':
        // The image gets the current style plus its own dimensions.
        $image_style = $state['current_style'];
        if ($element->height && $element->width) {
          $image_style['height'] = $element->height;
          $image_style['width'] = $element->width;
        }

        $element_src = (string) $element->src;
        if ($base_root !== '' && strpos($element_src, $base_root) === 0) {
          // The image source is a full url, but nevertheless it is on this
          // server.
          $element_src = substr($element_src, strlen($base_root));
        }

        $src = NULL;
        if (preg_match('#^https?://#i', $element_src)) {
          // The image url is from another site: download it to a temporary
          // file and record that file in $state['download_img_tmp'].
          $tmp_path = isset($state['download_img_path']) ? $state['download_img_path'] : '';
          $img_content = file_get_contents($element_src);
          if ($img_content !== FALSE) {
            // Name the file after the path part of the url only, so that a
            // query string does not end up in the file name.
            $url_path = parse_url($element_src, PHP_URL_PATH);
            $file_name = basename((is_string($url_path) && $url_path !== '') ? $url_path : $element_src);
            $tmp_file = $tmp_path . $file_name;
            if (file_put_contents($tmp_file, $img_content) !== FALSE) {
              $state['download_img_tmp'][] = $tmp_file;
              $src = $tmp_file;
            }
          }
        }
        elseif (strpos($element_src, '/') === 0) {
          $src = htmltodocx_doc_root() . $element_src;
        }
        else {
          $src = htmltodocx_doc_root() . $base_path . $element_src;
        }

        // A remote image which could not be fetched or stored is left out
        // rather than handing PHPWord a missing or empty file.
        if ($src !== NULL) {
          $phpword_element->addImage($src, $image_style);
        }
        break;

      default:
        $state['textrun'] = $phpword_element->createTextRun();
        $state['textrun']->addText(htmltodocx_clean_text($element->innertext), $state['current_style']);
        break;
    }

    // Reset the style back to what it was:
    $state['current_style'] = $old_style;
  }
}

/**
 * Before/after styles for list elements - recorded
 * for use by the first or last item in a list.
 *
 * @param array $state
 *   Conversion state. 'list_style_before' and 'list_style_after' are set
 *   from the spaceBefore / spaceAfter entries of 'current_style', or to an
 *   empty array where the entry is missing.
 */
function _htmltodocx_add_list_start_end_spacing_style(&$state) {

  $current_style = $state['current_style'];

  $state['list_style_after'] = array();
  if (isset($current_style['spaceAfter'])) {
    $state['list_style_after'] = array('spaceAfter' => $current_style['spaceAfter']);
  }

  $state['list_style_before'] = array();
  if (isset($current_style['spaceBefore'])) {
    $state['list_style_before'] = array('spaceBefore' => $current_style['spaceBefore']);
  }
}

/**
 * Get the document root.
*
 * The root is worked out by removing the directory part of the SCRIPT_NAME
 * environment variable from the end of the current working directory.
 *
 * @return string
 *   The current working directory with the script's directory stripped off
 *   the end, or the current working directory itself when SCRIPT_NAME is
 *   not available, has no directory part, or does not match the end of the
 *   working directory.
 */
function htmltodocx_doc_root() {

  $working_dir = realpath('');

  // Should be available on both Apache and non Apache servers.
  $local_path = getenv('SCRIPT_NAME');
  if (!is_string($working_dir) || !is_string($local_path) || $local_path === '') {
    return $working_dir;
  }

  $last_slash = strrpos($local_path, '/');
  $local_dir = ($last_slash === FALSE) ? '' : substr($local_path, 0, $last_slash);
  if (empty($local_dir)) {
    return $working_dir;
  }

  // Only strip the script directory when the working directory really ends
  // with it; otherwise an arbitrary number of characters would be cut off.
  // Directory separators are normalised for the comparison only.
  $comparable_dir = str_replace('\\', '/', $working_dir);
  if (substr($comparable_dir, -1 * strlen($local_dir)) !== $local_dir) {
    return $working_dir;
  }

  return substr($working_dir, 0, -1 * strlen($local_dir));
}

/**
 * Encodes selected characters in a url to prevent errors in the created Word
 * document. Note: if there is a space in the url and there isn't a forward
 * slash preceding it at some point, the resulting Word document will be
 * corrupted (even where the space has been urlencoded). We convert spaces to
 * %20 which stops this corruption in circumstances where a forward slash is
 * present.
 *
 * @param string $url
 *   The url to encode.
 *
 * @return string
 *   The url with each listed character replaced by its rawurlencode() form.
 */
function htmltodocx_url_encode_chars($url) {

  // List the characters in this array to be encoded:
  $encode_chars = array(' ');

  // Build the replacement list up front rather than appending to a variable
  // which was never initialised.
  $encoded_chars = array_map('rawurlencode', $encode_chars);

  return str_replace($encode_chars, $encoded_chars, $url);
}