<?php
//============================================================+
// File name   : wordphp.php
//
// Description : PHP class to read DOCX file into HTML format
//
// Author: Ricardo Pinto
//
// (c) Copyright:
//               Ricardo Pinto
//
// NOTE(review): this copy was recovered from the patch "add word php"
// (commit 1bb7907c, 2019-01-03, application/libraries/wordphp.php). The
// upstream license header and the HTML string literals had been stripped
// from that patch, so the markup emitted below was reconstructed from the
// surrounding code -- confirm it against the upstream WordPHP sources.
//============================================================+

/**
 * Converts the main part of a DOCX file (word/document.xml) into an HTML
 * fragment.
 *
 * Handled constructs: paragraphs, "HeadingN" paragraph styles, runs with
 * basic character formatting, hyperlinks, embedded raster images, simple
 * lists and tables. Element names are compared by their qualified name
 * exactly as WordprocessingML writes them ("w:p", "w:r", ...).
 */
class WordPHP
{
    /** @var mixed Truthy to echo diagnostic output while reading. */
    private $debug = false;

    /** @var string|null Path of the DOCX file currently being read. */
    private $file;

    /** @var DOMDocument|null Parsed word/_rels/document.xml.rels (null when the part is absent). */
    private $rels_xml;

    /** @var DOMDocument|null Parsed word/document.xml. */
    private $doc_xml;

    /** @var array<string,string> Names of the raster images found in the archive. */
    private $doc_media = [];

    /** @var array<string,string> Relationship Id => Target, taken from the rels part. */
    private $relationships = [];

    /** @var string Encoding of the HTML returned by readDocument(). */
    private $encoding = 'ISO-8859-1';

    /** @var string Base directory; extracted images are written to "<tmpDir>/tmp". */
    private $tmpDir = 'tmp';

    /**
     * CONSTRUCTOR
     *
     * @param mixed       $debug_   Debug mode or not (any truthy value enables it)
     * @param string|null $encoding Output encoding; defaults to ISO-8859-1
     */
    public function __construct($debug_ = null, $encoding = null)
    {
        // Loose comparisons are kept on purpose: callers may pass '', 0 or
        // false to mean "use the default".
        if ($debug_ != null) {
            $this->debug = $debug_;
        }
        if ($encoding != null) {
            $this->encoding = $encoding;
        }
        $this->tmpDir = __DIR__;
    }

    /**
     * Sets the base directory under which images will be stored
     * (files are written to "<tmp>/tmp/<relationship id>.<ext>").
     *
     * @param string $tmp The location
     * @return void
     */
    public function setTmpDir($tmp)
    {
        $this->tmpDir = $tmp;
    }

    /**
     * READS THE GIVEN DOCX FILE INTO HTML FORMAT
     *
     * @param string $filename The DOCX file name
     * @return string HTML code, converted to the configured encoding
     * @throws RuntimeException When the archive or its main document part cannot be read
     */
    public function readDocument($filename)
    {
        $this->file = $filename;
        $this->readZipPart($filename);

        $html = $this->renderBlocks((string) $this->doc_xml->saveXML());

        // XMLReader always hands out UTF-8, so conversion is only needed for
        // other target encodings.
        if (!$this->isUtf8($this->encoding)) {
            $html = mb_convert_encoding($html, $this->encoding, 'UTF-8');
        }

        if ($this->debug) {
            echo "<div style='width:100%; height: 200px;'>";
            echo $html;
            echo "</div>";
        }

        return $html;
    }

    /**
     * Tells whether an encoding name designates UTF-8.
     *
     * @param mixed $encoding Encoding name
     * @return bool
     */
    private function isUtf8($encoding)
    {
        return is_string($encoding)
            && in_array(strtoupper($encoding), ['UTF-8', 'UTF8'], true);
    }

    /**
     * Escapes a value for use in HTML text or inside a quoted attribute.
     *
     * @param mixed $text Raw UTF-8 text
     * @return string
     */
    private function escape($text)
    {
        return htmlentities((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }

    /**
     * Parses one XML part into a DOMDocument.
     *
     * The document is loaded verbatim (no pretty-printing): added indentation
     * would end up inside the extracted text.
     *
     * @param DOMDocument|null $object Receives the parsed document
     * @param string           $xml    The XML source
     * @return void
     * @throws RuntimeException When the XML is empty or malformed
     */
    private function setXmlParts(&$object, $xml)
    {
        if (!is_string($xml) || $xml === '') {
            throw new RuntimeException('Empty XML part in DOCX archive');
        }

        $document = new DOMDocument();
        $previous = libxml_use_internal_errors(true);
        $loaded = $document->loadXML($xml);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if (!$loaded) {
            throw new RuntimeException('Malformed XML part in DOCX archive');
        }
        $object = $document;
    }

    /**
     * READS the document and relationship parts out of the archive and
     * records the names of the raster images it contains.
     *
     * @param string $filename The filename
     * @return void
     * @throws RuntimeException When the file is not a readable zip archive or
     *                          has no word/document.xml part
     */
    private function readZipPart($filename)
    {
        $zip = new ZipArchive();
        if ($zip->open($filename) !== true) {
            throw new RuntimeException("Unable to open '" . $filename . "' as a DOCX (zip) archive");
        }

        $xml = $zip->getFromName('word/document.xml');
        $xmlRels = $zip->getFromName('word/_rels/document.xml.rels');

        $this->doc_media = [];
        for ($i = 0; $i < $zip->numFiles; $i++) {
            $name = $zip->getNameIndex($i);
            if (is_string($name) && preg_match('/\.(jpe?g|png|gif|bmp)$/i', $name) === 1) {
                $this->doc_media[$name] = $name;
            }
        }
        $zip->close();

        if ($xml === false) {
            throw new RuntimeException("'" . $filename . "' has no word/document.xml part");
        }
        $this->setXmlParts($this->doc_xml, $xml);

        // The relationships part is optional: without it hyperlinks and
        // images simply cannot be resolved.
        $this->rels_xml = null;
        $this->relationships = [];
        if ($xmlRels !== false && $xmlRels !== '') {
            $this->setXmlParts($this->rels_xml, $xmlRels);
            foreach ($this->rels_xml->getElementsByTagName('Relationship') as $relationship) {
                $id = $relationship->getAttribute('Id');
                $this->relationships[$id] = $relationship->getAttribute('Target');
            }
        }

        if ($this->debug) {
            echo "<textarea style='width:100%; height: 200px;'>";
            echo $this->escape($this->doc_xml->saveXML());
            echo "</textarea>";
            echo "<textarea style='width:100%; height: 200px;'>";
            echo $this->rels_xml !== null ? $this->escape($this->rels_xml->saveXML()) : '';
            echo "</textarea>";
        }
    }

    /**
     * Renders every paragraph and table found below the root of an XML
     * fragment (the document itself, or one table cell).
     *
     * Consecutive list paragraphs of the same kind share one list element.
     *
     * @param string $xml XML source of the container
     * @return string HTML code
     */
    private function renderBlocks($xml)
    {
        if (!is_string($xml) || $xml === '') {
            return '';
        }

        $reader = new XMLReader();
        $reader->XML($xml);

        $html = '';
        $openList = null;

        // next() already lands on the following node, so it must not be
        // followed by read(): that would skip the sibling of every handled
        // element.
        $more = $reader->read();
        while ($more) {
            $isElement = $reader->nodeType === XMLReader::ELEMENT;

            if ($isElement && $reader->name === 'w:p') {
                $paragraph = $this->renderParagraph(trim($reader->readOuterXML()));

                if ($paragraph['list'] !== $openList) {
                    if ($openList !== null) {
                        $html .= '</' . $openList . '>';
                    }
                    $openList = $paragraph['list'];
                    if ($openList !== null) {
                        $html .= '<' . $openList . '>';
                    }
                }
                $tag = $openList !== null ? 'li' : $paragraph['tag'];
                $html .= '<' . $tag . '>' . $paragraph['inner'] . '</' . $tag . '>';

                $more = $reader->next();
                continue;
            }

            if ($isElement && $reader->name === 'w:tbl') {
                if ($openList !== null) {
                    $html .= '</' . $openList . '>';
                    $openList = null;
                }
                $html .= $this->checkTableFormating($reader);

                $more = $reader->next();
                continue;
            }

            $more = $reader->read();
        }
        $reader->close();

        if ($openList !== null) {
            $html .= '</' . $openList . '>';
        }

        return $html;
    }

    /**
     * Renders the content of one paragraph.
     *
     * @param string $xml XML source of the w:p element
     * @return array{tag:string,list:?string,inner:string} "tag" is "p" or
     *         "h1".."h6", "list" is "ol", "ul" or null, "inner" the HTML of
     *         the paragraph content
     */
    private function renderParagraph($xml)
    {
        $result = ['tag' => 'p', 'list' => null, 'inner' => ''];
        if (!is_string($xml) || $xml === '') {
            return $result;
        }

        $reader = new XMLReader();
        $reader->XML($xml);

        $more = $reader->read();
        while ($more) {
            if ($reader->nodeType === XMLReader::ELEMENT) {
                $handled = true;
                switch ($reader->name) {
                    case 'w:pPr':
                        $level = $this->getHeaderLevel($reader);
                        if ($level > 0) {
                            // A numbered heading stays a heading.
                            $result['tag'] = 'h' . $level;
                        } else {
                            $result['list'] = $this->getListFormating($reader);
                        }
                        break;
                    case 'w:r':
                        $result['inner'] .= $this->checkFormating($reader);
                        break;
                    case 'w:hyperlink':
                        $hyperlink = $this->getHyperlink($reader);
                        $result['inner'] .= $hyperlink['open']
                            . $this->checkFormating($reader)
                            . $hyperlink['close'];
                        break;
                    case 'w:drawing':
                        $result['inner'] .= $this->imageTag($this->checkImageFormating($reader));
                        break;
                    default:
                        // Containers are walked so that runs nested in them
                        // are still found.
                        $handled = false;
                }
                if ($handled) {
                    $more = $reader->next();
                    continue;
                }
            }
            $more = $reader->read();
        }
        $reader->close();

        return $result;
    }

    /**
     * Reads the heading level out of the paragraph properties.
     *
     * Only style ids of the form "Heading1".."Heading6" are recognised.
     * NOTE(review): localized templates use other style ids; resolving those
     * would require word/styles.xml, which this class does not read.
     *
     * @param XMLReader $xml Reader positioned on a w:pPr element
     * @return int 1..6, or 0 when the paragraph is not a heading
     */
    private function getHeaderLevel(XMLReader $xml)
    {
        $node = trim($xml->readOuterXML());
        if ($node === '') {
            return 0;
        }

        $level = 0;
        $reader = new XMLReader();
        $reader->XML($node);
        while ($reader->read()) {
            if ($reader->nodeType === XMLReader::ELEMENT && $reader->name === 'w:pStyle') {
                $style = (string) $reader->getAttribute('w:val');
                if (preg_match('/^Heading\s*([1-6])$/i', $style, $matches) === 1) {
                    $level = (int) $matches[1];
                }
                break;
            }
        }
        $reader->close();

        return $level;
    }

    /**
     * CHECKS THE ELEMENT FOR LIST ELEMENTS
     * Currently under development: numbering id 1 is taken to be an ordered
     * list and numbering id 2 a bulleted list; word/numbering.xml is not
     * consulted.
     *
     * @param XMLReader $xml Reader positioned on a w:pPr element
     * @return string|null "ol", "ul", or null when the paragraph is no list item
     */
    private function getListFormating(XMLReader $xml)
    {
        $node = trim($xml->readOuterXML());
        if ($node === '') {
            return null;
        }

        $tag = null;
        $reader = new XMLReader();
        $reader->XML($node);
        while ($reader->read()) {
            if ($reader->nodeType === XMLReader::ELEMENT && $reader->name === 'w:numId') {
                $numId = $reader->getAttribute('w:val');
                if ($numId === '1') {
                    $tag = 'ol';
                } elseif ($numId === '2') {
                    $tag = 'ul';
                }
                break;
            }
        }
        $reader->close();

        return $tag;
    }

    /**
     * Tells whether an on/off run property (w:b, w:i) is switched on.
     *
     * @param XMLReader $reader Reader positioned on the property element
     * @return bool False only when w:val explicitly disables the property
     */
    private function isToggleOn(XMLReader $reader)
    {
        $value = $reader->getAttribute('w:val');

        return $value === null
            || !in_array(strtolower($value), ['0', 'false', 'off'], true);
    }

    /**
     * CHECKS THE FONT FORMATTING OF A GIVEN ELEMENT
     * Currently checks and formats: bold, italic, underline, text color,
     * background color and font family. Text is taken from the w:t elements
     * in document order; w:br becomes a line break and an embedded picture
     * becomes an image element.
     *
     * @param XMLReader $xml Reader positioned on a w:r or w:hyperlink element
     * @return string HTML formatted code ('' when there is nothing to show)
     */
    private function checkFormating(XMLReader $xml)
    {
        $node = trim($xml->readOuterXML());
        if ($node === '') {
            return '';
        }

        $styles = [];
        $content = '';
        $inText = false;

        $reader = new XMLReader();
        $reader->XML($node);

        while ($reader->read()) {
            switch ($reader->nodeType) {
                case XMLReader::ELEMENT:
                    switch ($reader->name) {
                        case 'w:t':
                            $inText = !$reader->isEmptyElement;
                            break;
                        case 'w:br':
                            $content .= '<br>';
                            break;
                        case 'w:b':
                            if ($this->isToggleOn($reader)) {
                                $styles['font-weight'] = 'bold';
                            }
                            break;
                        case 'w:i':
                            if ($this->isToggleOn($reader)) {
                                $styles['font-style'] = 'italic';
                            }
                            break;
                        case 'w:u':
                            if ($reader->getAttribute('w:val') !== 'none') {
                                $styles['text-decoration'] = 'underline';
                            }
                            break;
                        case 'w:color':
                            $color = (string) $reader->getAttribute('w:val');
                            // "auto" and anything that is not RRGGBB is skipped.
                            if (preg_match('/^[0-9A-Fa-f]{6}$/', $color) === 1) {
                                $styles['color'] = '#' . $color;
                            }
                            break;
                        case 'w:rFonts':
                            $font = preg_replace(
                                '/["\'\\\\;<>{}&]/',
                                '',
                                (string) $reader->getAttribute('w:ascii')
                            );
                            if (is_string($font) && trim($font) !== '') {
                                $styles['font-family'] = '"' . trim($font) . '"';
                            }
                            break;
                        case 'w:shd':
                            // NOTE(review): shading patterns "clear" and black
                            // fills are ignored, as the code always did --
                            // confirm whether "clear" fills should be shown.
                            $fill = (string) $reader->getAttribute('w:fill');
                            if ($reader->getAttribute('w:val') !== 'clear'
                                && $fill !== '000000'
                                && preg_match('/^[0-9A-Fa-f]{6}$/', $fill) === 1
                            ) {
                                $styles['background-color'] = '#' . $fill;
                            }
                            break;
                        case 'w:drawing':
                            $content .= $this->imageTag($this->checkImageFormating($reader));
                            break;
                    }
                    break;

                case XMLReader::END_ELEMENT:
                    if ($reader->name === 'w:t') {
                        $inText = false;
                    }
                    break;

                case XMLReader::TEXT:
                case XMLReader::CDATA:
                case XMLReader::WHITESPACE:
                case XMLReader::SIGNIFICANT_WHITESPACE:
                    if ($inText) {
                        $content .= $this->escape($reader->value);
                    }
                    break;
            }
        }
        $reader->close();

        if ($content === '') {
            return '';
        }

        $declarations = [];
        foreach ($styles as $property => $value) {
            $declarations[] = $property . ': ' . $value;
        }
        $attribute = $declarations === [] ? '' : " style='" . implode('; ', $declarations) . "'";

        return '<span' . $attribute . '>' . $content . '</span>';
    }

    /**
     * Builds the image element for an extracted picture.
     *
     * @param string|null $src Location returned by checkImageFormating()
     * @return string HTML code, '' when there is no picture
     */
    private function imageTag($src)
    {
        if ($src === null || $src === '') {
            return '';
        }

        return "<img src='" . $this->escape($src) . "' />";
    }

    /**
     * CHECKS IF THERE IS AN IMAGE PRESENT
     * Looks for the first a:blip below the given node, resolves its r:embed
     * relationship and extracts the picture from the archive.
     *
     * @param XMLReader $xml Reader positioned on the node to inspect
     * @return string|null The location of the image, null when none could be extracted
     */
    private function checkImageFormating(XMLReader $xml)
    {
        $content = trim($xml->readOuterXML());
        if ($content === '') {
            return null;
        }

        $relId = null;
        $reader = new XMLReader();
        $reader->XML($content);
        while ($reader->read()) {
            if ($reader->nodeType === XMLReader::ELEMENT && $reader->name === 'a:blip') {
                $relId = $reader->getAttribute('r:embed');
                break;
            }
        }
        $reader->close();

        if ($relId === null || $relId === '' || !isset($this->relationships[$relId])) {
            return null;
        }

        // Targets are relative to the word/ folder unless they start with "/".
        $target = $this->relationships[$relId];
        $link = strpos($target, '/') === 0 ? ltrim($target, '/') : 'word/' . $target;

        $zip = new ZipArchive();
        if ($zip->open($this->file) !== true) {
            return null;
        }
        $data = $zip->getFromName($link);
        $zip->close();

        if ($data === false) {
            return null;
        }

        return $this->createImage($data, $relId, $link);
    }

    /**
     * Creates an image in the filesystem
     *
     * The picture is decoded and re-encoded with GD and written to
     * "<tmpDir>/tmp/<relationship id>.<extension>".
     *
     * @param string $image The raw image data
     * @param string $relId The image relationship Id
     * @param string $name  The image name inside the archive
     * @return string|null Path of the written file; null for unsupported
     *                     formats, undecodable data or an unwritable directory
     */
    private function createImage($image, $relId, $name)
    {
        $ext = strtolower((string) pathinfo($name, PATHINFO_EXTENSION));
        if (!in_array($ext, ['png', 'bmp', 'gif', 'jpeg', 'jpg'], true)) {
            return null;
        }
        if (!is_string($image) || $image === '' || !function_exists('imagecreatefromstring')) {
            return null;
        }

        $dir = rtrim((string) $this->tmpDir, '/\\') . '/tmp';
        if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
            return null;
        }

        $im = imagecreatefromstring($image);
        if ($im === false) {
            return null;
        }

        // The id comes from the document: keep it from escaping the directory.
        $safeId = preg_replace('/[^A-Za-z0-9_\-]/', '_', (string) $relId);
        $fname = $dir . '/' . $safeId . '.' . $ext;

        switch ($ext) {
            case 'png':
                imagealphablending($im, false);
                imagesavealpha($im, true);
                $written = imagepng($im, $fname);
                break;
            case 'bmp':
                $written = function_exists('imagebmp') && imagebmp($im, $fname);
                break;
            case 'gif':
                $written = imagegif($im, $fname);
                break;
            default:
                $written = imagejpeg($im, $fname);
        }

        return $written ? $fname : null;
    }

    /**
     * Tells whether a link target may be written into an href attribute.
     *
     * @param string $url Link target
     * @return bool True for relative targets and for http, https, ftp,
     *              mailto and tel URLs
     */
    private function isSafeUrl($url)
    {
        // Browsers ignore control characters and blanks inside the scheme.
        $probe = preg_replace('/[\x00-\x20]+/', '', (string) $url);
        if (!is_string($probe)) {
            return false;
        }
        if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $probe, $matches) !== 1) {
            return true;
        }

        return in_array(strtolower($matches[1]), ['http', 'https', 'ftp', 'mailto', 'tel'], true);
    }

    /**
     * CHECKS IF ELEMENT IS AN HYPERLINK
     *
     * The reader is left on the element itself, so the caller can go on to
     * render its content.
     *
     * @param XMLReader $xml Reader positioned on a w:hyperlink element
     * @return array{open:string,close:string} HTML open and closing tag
     *         (both empty when the target cannot be resolved or is not allowed)
     */
    private function getHyperlink(XMLReader $xml)
    {
        $ret = ['open' => '', 'close' => ''];

        $relId = $xml->getAttribute('r:id');
        $anchor = $xml->getAttribute('w:anchor');

        $link = '';
        $external = false;
        if ($relId !== null && $relId !== '' && isset($this->relationships[$relId])) {
            $link = $this->relationships[$relId];
            $external = true;
        } elseif ($anchor !== null && $anchor !== '') {
            $link = '#' . $anchor;
        }

        if ($link !== '' && $this->isSafeUrl($link)) {
            $ret['open'] = "<a href='" . $this->escape($link) . "'"
                . ($external ? " target='_blank'" : '') . '>';
            $ret['close'] = '</a>';
        }

        return $ret;
    }

    /**
     * PROCESS TABLE CONTENT
     *
     * @param XMLReader $xml Reader positioned on a w:tbl element
     * @return string The HTML code of the table
     */
    private function checkTableFormating(XMLReader $xml)
    {
        $table = "<table style='border-collapse:collapse;'><tbody>";

        $node = trim($xml->readOuterXML());
        if ($node !== '') {
            $reader = new XMLReader();
            $reader->XML($node);

            $more = $reader->read();
            while ($more) {
                if ($reader->nodeType === XMLReader::ELEMENT && $reader->name === 'w:tr') {
                    $table .= '<tr>' . $this->processTableRow(trim($reader->readOuterXML())) . '</tr>';
                    // Skipping the row keeps rows of nested tables out of
                    // this table.
                    $more = $reader->next();
                    continue;
                }
                $more = $reader->read();
            }
            $reader->close();
        }

        $table .= '</tbody></table>';

        return $table;
    }

    /**
     * PROCESS THE TABLE ROW
     *
     * @param string $content XML source of the w:tr element
     * @return string The HTML code of the cells of the row
     */
    private function processTableRow($content)
    {
        if (!is_string($content) || $content === '') {
            return '';
        }

        $cells = '';
        $reader = new XMLReader();
        $reader->XML($content);

        $more = $reader->read();
        while ($more) {
            if ($reader->nodeType === XMLReader::ELEMENT && $reader->name === 'w:tc') {
                $cells .= $this->processTableCell(trim($reader->readOuterXML()));
                $more = $reader->next();
                continue;
            }
            $more = $reader->read();
        }
        $reader->close();

        return $cells;
    }

    /**
     * Renders one table cell: its border style and the paragraphs and nested
     * tables it contains.
     *
     * @param string $content XML source of the w:tc element
     * @return string The HTML code of the cell
     */
    private function processTableCell($content)
    {
        if (!is_string($content) || $content === '') {
            return '<td></td>';
        }

        $style = '';
        $reader = new XMLReader();
        $reader->XML($content);
        while ($reader->read()) {
            if ($reader->nodeType !== XMLReader::ELEMENT) {
                continue;
            }
            if ($reader->name === 'w:tcPr') {
                $style = $this->processTableStyle(trim($reader->readOuterXML()));
                break;
            }
            if ($reader->name === 'w:p' || $reader->name === 'w:tbl') {
                // Content reached first: this cell has no properties of its own.
                break;
            }
        }
        $reader->close();

        $attribute = $style === '' ? '' : ' style="' . $style . '"';

        return '<td' . $attribute . '>' . $this->renderBlocks($content) . '</td>';
    }

    /**
     * PROCESS THE TABLE CELL STYLE
     *
     * Translates the children of w:tcBorders into CSS border declarations,
     * for instance "border-bottom:1px dotted #FF0000;". Border widths are
     * given in eighths of a point and are rounded to at least one pixel.
     *
     * @param string $content XML source of the w:tcPr element
     * @return string CSS declarations ('' when no border is defined)
     */
    private function processTableStyle($content)
    {
        if (!is_string($content) || $content === '') {
            return '';
        }

        // "start" and "end" are the alternative names of the left and right
        // borders.
        $sides = [
            'top' => 'top',
            'left' => 'left',
            'start' => 'left',
            'bottom' => 'bottom',
            'right' => 'right',
            'end' => 'right',
        ];

        $style = '';
        $inBorders = false;

        $reader = new XMLReader();
        $reader->XML($content);
        while ($reader->read()) {
            if ($reader->name === 'w:tcBorders') {
                $inBorders = $reader->nodeType === XMLReader::ELEMENT && !$reader->isEmptyElement;
                continue;
            }
            if (!$inBorders
                || $reader->nodeType !== XMLReader::ELEMENT
                || strpos($reader->name, 'w:') !== 0
            ) {
                continue;
            }

            $side = substr($reader->name, 2);
            if (!isset($sides[$side])) {
                continue;
            }

            $line = $this->convertLine($reader->getAttribute('w:val'));
            if ($line === 'none') {
                $style .= 'border-' . $sides[$side] . ':none;';
                continue;
            }

            $width = max(1, (int) round(((int) $reader->getAttribute('w:sz')) / 8));
            $color = (string) $reader->getAttribute('w:color');
            if (preg_match('/^[0-9A-Fa-f]{6}$/', $color) !== 1) {
                $color = '000000'; // "auto" or missing
            }

            $style .= 'border-' . $sides[$side] . ':' . $width . 'px ' . $line . ' #' . $color . ';';
        }
        $reader->close();

        return $style;
    }

    /**
     * Maps a WordprocessingML border style onto a CSS border-style keyword.
     *
     * @param string|null $in Border style (value of w:val)
     * @return string CSS keyword; "solid" for every style without a closer match
     */
    private function convertLine($in)
    {
        $map = [
            'nil' => 'none',
            'none' => 'none',
            'dotted' => 'dotted',
            'dashed' => 'dashed',
            'dashSmallGap' => 'dashed',
            'dotDash' => 'dashed',
            'dotDotDash' => 'dashed',
            'dotdotDash' => 'dashed',
            'dashDotStroked' => 'dashed',
            'double' => 'double',
            'triple' => 'double',
            'threeDEmboss' => 'ridge',
            'threeDEngrave' => 'groove',
            'inset' => 'inset',
            'outset' => 'outset',
        ];
        $key = (string) $in;

        return isset($map[$key]) ? $map[$key] : 'solid';
    }
}