如何从word文件中提取文本 .doc,docx,.xlsx,.pptx php.



可能有一种情况我们需要从word文档中获取文本以供将来使用来搜索用户上传的文档中的字符串,例如在简历/简历中搜索并出现一个常见问题,即如何获取文本,打开并阅读用户上传的Word文档,有一些有用的链接但不能解决整个问题。我们需要在上传时获取文本并将文本保存在数据库中,我们可以轻松地在数据库中搜索。

这是一个简单的类,它可以为 .doc/.docx 做正确的工作,PHP docx阅读器:将MS Word Docx文件转换为文本

    class DocxConversion{
    private $filename;
    public function __construct($filePath) {
        $this->filename = $filePath;
    }
    private function read_doc() {
        $fileHandle = fopen($this->filename, "r");
        $line = @fread($fileHandle, filesize($this->filename));   
        $lines = explode(chr(0x0D),$line);
        $outtext = "";
        foreach($lines as $thisline)
          {
            $pos = strpos($thisline, chr(0x00));
            if (($pos !== FALSE)||(strlen($thisline)==0))
              {
              } else {
                $outtext .= $thisline." ";
              }
          }
         $outtext = preg_replace("/[^a-zA-Z0-9s,.-nrt@/_()]/","",$outtext);
        return $outtext;
    }
    private function read_docx(){
        $striped_content = '';
        $content = '';
        $zip = zip_open($this->filename);
        if (!$zip || is_numeric($zip)) return false;
        while ($zip_entry = zip_read($zip)) {
            if (zip_entry_open($zip, $zip_entry) == FALSE) continue;
            if (zip_entry_name($zip_entry) != "word/document.xml") continue;
            $content .= zip_entry_read($zip_entry, zip_entry_filesize($zip_entry));
            zip_entry_close($zip_entry);
        }// end while
        zip_close($zip);
        $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
        $content = str_replace('</w:r></w:p>', "rn", $content);
        $striped_content = strip_tags($content);
        return $striped_content;
    }
 /************************excel sheet************************************/
function xlsx_to_text($input_file){
    $xml_filename = "xl/sharedStrings.xml"; //content file name
    $zip_handle = new ZipArchive;
    $output_text = "";
    if(true === $zip_handle->open($input_file)){
        if(($xml_index = $zip_handle->locateName($xml_filename)) !== false){
            $xml_datas = $zip_handle->getFromIndex($xml_index);
            $xml_handle = DOMDocument::loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            $output_text = strip_tags($xml_handle->saveXML());
        }else{
            $output_text .="";
        }
        $zip_handle->close();
    }else{
    $output_text .="";
    }
    return $output_text;
}
/*************************power point files*****************************/
function pptx_to_text($input_file){
    $zip_handle = new ZipArchive;
    $output_text = "";
    if(true === $zip_handle->open($input_file)){
        $slide_number = 1; //loop through slide files
        while(($xml_index = $zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false){
            $xml_datas = $zip_handle->getFromIndex($xml_index);
            $xml_handle = DOMDocument::loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            $output_text .= strip_tags($xml_handle->saveXML());
            $slide_number++;
        }
        if($slide_number == 1){
            $output_text .="";
        }
        $zip_handle->close();
    }else{
    $output_text .="";
    }
    return $output_text;
}

    public function convertToText() {
        if(isset($this->filename) && !file_exists($this->filename)) {
            return "File Not exists";
        }
        $fileArray = pathinfo($this->filename);
        $file_ext  = $fileArray['extension'];
        if($file_ext == "doc" || $file_ext == "docx" || $file_ext == "xlsx" || $file_ext == "pptx")
        {
            if($file_ext == "doc") {
                return $this->read_doc();
            } elseif($file_ext == "docx") {
                return $this->read_docx();
            } elseif($file_ext == "xlsx") {
                return $this->xlsx_to_text();
            }elseif($file_ext == "pptx") {
                return $this->pptx_to_text();
            }
        } else {
            return "Invalid File Type";
        }
    }
}

Document_file_format文档文件是二进制 blob。可以使用 fopen 读取它们。虽然.docx文件只是zip文件和xml文件xml文件在zipfile容器(源维基百科)中,但您可以使用zip_open读取它们。

以上类的用法

$docObj = new DocxConversion("test.doc");
//$docObj = new DocxConversion("test.docx");
//$docObj = new DocxConversion("test.xlsx");
//$docObj = new DocxConversion("test.pptx");
echo $docText= $docObj->convertToText();

来自 DOC 文件

$filename = 'ypue file';         
    if ( file_exists($filename) ) {        
    if ( ($fh = fopen($filename, 'r')) !== false ) {
    $headers = fread($fh, 0xA00);
    $n1 = ( ord($headers[0x21C]) - 1 );
    $n2 = ( ( ord($headers[0x21D]) - 8 ) * 256 );
    $n3 = ( ( ord($headers[0x21E]) * 256 ) * 256 );
    $n4 = ( ( ( ord($headers[0x21F]) * 256 ) * 256 ) * 256 );

    $textLength = ($n1 + $n2 + $n3 + $n4);
    $extracted_plaintext = fread($fh, $textLength);
    echo nl2br($extracted_plaintext);
    print_r(extract_emails_from($extracted_plaintext));    
         }
        }
    function extract_emails_from($string) {
             preg_match_all("/[._a-zA-Z0-9-]+@[._a-zA-Z0-9-]+/i", $string, $matches);
             return $matches[0];
    }

来自 DOCX :

    /*Name of the document file*/
    $document = 'your file';
    /**Function to extract text*/
    function extracttext($filename) {
        //Check for extension
        $ext = end(explode('.', $filename));
    //if its docx file
    if($ext == 'docx')
    $dataFile = "word/document.xml";
    //else it must be odt file
    else
    $dataFile = "content.xml";     
    //Create a new ZIP archive object
    $zip = new ZipArchive;
    // Open the archive file
    if (true === $zip->open($filename)) {
        // If successful, search for the data file in the archive
        if (($index = $zip->locateName($dataFile)) !== false) {
            // Index found! Now read it to a string
            $text = $zip->getFromIndex($index);
            // Load XML from a string
            // Ignore errors and warnings
            $xml = DOMDocument::loadXML($text, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            // Remove XML formatting tags and return the text
            return strip_tags($xml->saveXML());
        }
        //Close the archive file
        $zip->close();
    }
    // In case of failure return a message
    return "File not found";
}
echo extracttext($document);

//For DOCX.如果要保留空格,还要注意表 tr 和 tc,请使用以下代码: 根据您的喜好修改它。因为它从远程或本地下载文件

//=========DOCX===========
function extractDocxText($url,$file_name){
        $docx = get_url($url);
        file_put_contents("tempf.docx",$docx);
        $xml_filename = "word/document.xml"; //content file name
        $zip_handle = new ZipArchive;
        $output_text = "";
        if(true === $zip_handle->open("tempf.docx")){
            if(($xml_index = $zip_handle->locateName($xml_filename)) !== false){
                $xml_datas = $zip_handle->getFromIndex($xml_index);
                //file_put_contents($input_file.".xml",$xml_datas);
                $replace_newlines = preg_replace('/<w:p w[0-9-Za-z]+:[a-zA-Z0-9]+="[a-zA-z"0-9 :="]+">/',"nr",$xml_datas);
                $replace_tableRows = preg_replace('/<w:tr>/',"nr",$replace_newlines);
                $replace_tab = preg_replace('/<w:tab/>/',"t",$replace_tableRows);
                $replace_paragraphs = preg_replace('/</w:p>/',"nr",$replace_tab);
                $replace_other_Tags = strip_tags($replace_paragraphs);          
                $output_text = $replace_other_Tags;
            }else{
                $output_text .="";
            }
            $zip_handle->close();
        }else{
        $output_text .=" ";
        }
        chmod("tempf.docx", 0777);  unlink(realpath("tempf.docx"));
        //save to file or echo content
        file_put_contents($file_name,$output_text);
        echo $output_text;
    }
//========PDF===========
//Requires installation in your Linux server
//sudo su
//apt-get install xpdf
function extractPdfText($url,$PDF_fullpath_or_Filename){
    $pdf = get_url($url);
    file_put_contents ("temppdf.txt", $pdf);
    $content = pdf2text("temppdf.txt");
    chmod("temppdf.txt", 0777); unlink(realpath("temppdf.txt"));
    echo $content;
    file_put_contents($PDF_fullpath_or_Filename,$content);
    }

//========DOC==========
function extractDocText($url,$file_name){
    $doc = get_url($url);
    file_put_contents ("tempf.txt", $doc);
    $fileHandle = fopen("tempf.txt", "r");
    $line = @fread($fileHandle, filesize("tempf.txt"));
    $lines = explode(chr(0x0D),$line);
    $outtext = "";
    foreach($lines as $thisline){
        $pos = strpos($thisline, chr(0x00));
        if (($pos !== FALSE)||(strlen($thisline)==0))
        {} else {$outtext .= $thisline."nr";}
        }
    $content = preg_replace('/[a-zA-Z0-9s,.-nrt@/_()]/','  ',$outtext);
    //chmod("tempf.txt", 0777); unlink(realpath("tempf.txt"));
    echo $content;
    file_put_contents($file_name,$content);
    }

//========XLSX==========
function extractXlsxText($url,$file_name){
    $xlsx = get_url($url);
    file_put_contents ("tempf.txt", $xlsx);
    $content = "";
    $dir = 'tempforxlsx';
    // Unzip
    $zip = new ZipArchive();
    $zip->open("tempf.txt");
    $zip->extractTo($dir);
    // Open up shared strings & the first worksheet
    $strings = simplexml_load_file($dir . '/xl/sharedStrings.xml');
    $sheet   = simplexml_load_file($dir . '/xl/worksheets/sheet1.xml');
    // Parse the rows
    $xlrows = $sheet->sheetData->row;
    foreach ($xlrows as $xlrow) {
        $arr = array();
        // In each row, grab it's value
        foreach ($xlrow->c as $cell) {
            $v = (string) $cell->v;
            // If it has a "t" (type?) of "s" (string?), use the value to look up string value
            if (isset($cell['t']) && $cell['t'] == 's') {
                $s  = array();
                $si = $strings->si[(int) $v];
                // Register & alias the default namespace or you'll get empty results in the xpath query
                $si->registerXPathNamespace('n', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main');
                // Cat together all of the 't' (text?) node values
                foreach($si->xpath('.//n:t') as $t) {
                    $content .= $t."  ";}   }
            }
        }
    echo $content;
    file_put_contents($file_name,$content);
    }

//========PPT========== 
function extractPptText($url,$file_name){
    $ppt = file_get_contents($url);
    file_put_contents ("tempf.ppt", $ppt);
    $fileHandle = fopen("tempf.ppt", "r");
    $line = @fread($fileHandle, filesize("tempf.ppt"));
    $lines = explode(chr(0x0f),$line);
    $outtext = '';
    foreach($lines as $thisline) {
        if (strpos($thisline, chr(0x00).chr(0x00).chr(0x00)) == 1) {
            $text_line = substr($thisline, 4);
            $end_pos   = strpos($text_line, chr(0x00));
            $text_line = substr($text_line, 0, $end_pos);
            $text_line = preg_replace('/[^a-zA-Z0-9s,.-nrt@/_()]/',"  ",$text_line);
            $outtext = substr($text_line, 0, $end_pos)."n".$outtext;
        }
    }
    //echo $outtext;
    file_put_contents($file_name,$outtext);
    }
//========PPTX==========
function extractPptxText($url,$file_name){
    $xls = get_url($url);
    file_put_contents ("tempf.txt", $xls);
    $zip_handle = new ZipArchive;
    $output_text = ' ';
    if(true === $zip_handle->open("tempf.txt")){
        $slide_number = 1; //loop through slide files
        while(($xml_index = $zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false){
            $xml_datas = $zip_handle->getFromIndex($xml_index); // these four lines of codes
                                                                // below were
            $xml_handle = new DOMDocument ();                   // added by me in order
            $xml_handle->preserveWhiteSpace = true;             // to preserve space between
            $xml_handle->formatOutput = true;                   // each read data
            $xml_handle->loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            $output_text .= $xml_handle->saveXML();
            $slide_number++;
            }
        if($slide_number == 1){
            $output_text .= "";
        }
        $zip_handle->close();
    }else{
    $output_text .= "";
    }
    echo $output_text;
    file_put_contents($file_name,$output_text);
    }
    /*
==========================================================================
=========================================================================
And below is get_url() function: Better than fie_get_contents();
*/
function get_url( $url,$timeout = 5 )
    {
        $url = str_replace( "&amp;", "&", urldecode(trim($url)) );
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1" );
        curl_setopt( $ch, CURLOPT_URL, $url );
        curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
        curl_setopt( $ch, CURLOPT_ENCODING, "" );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
        curl_setopt( $ch, CURLOPT_AUTOREFERER, true );
        curl_setopt( $ch, CURLOPT_SSL_VERIFYPEER, false );    # required for https urls
        curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $timeout );
        curl_setopt( $ch, CURLOPT_TIMEOUT, $timeout );
        curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );
        $content = curl_exec( $ch );
        //$response = curl_getinfo( $ch ); 
        curl_close ( $ch );
        return $content;
    }

您可以使用 PHPOffice 在 PHP 中读取 docx 内容。

    $content = '';
    $phpWord = PhpOfficePhpWordIOFactory::load($r->file);
    foreach($phpWord->getSections() as $section) {
        foreach($section->getElements() as $element) {
            if (method_exists($element, 'getElements')) {
                foreach($element->getElements() as $childElement) {
                    if (method_exists($childElement, 'getText')) {
                        $content .= $childElement->getText() . ' ';
                    }
                    else if (method_exists($childElement, 'getContent')) {
                        $content .= $childElement->getContent() . ' ';
                    }
                }
            }
            else if (method_exists($element, 'getText')) {
                $content .= $element->getText() . ' ';
            }
        }
    }

然后print_r$content以查看字符串形式的内容。

对于 docx 文档,我建议使用docx2txt工具(至少在 Debian/Ubuntu 上可用):

docx2txt < your_file.docx

README解释如何将其与 vim 集成。添加到您的.vimrc

" use docx2txt.pl to allow VIm to view the text content of a .docx file directly.
autocmd BufReadPre *.docx set ro
autocmd BufReadPost *.docx %!docx2txt

(它还解释了如何与 Emacs 集成)。

对于黑客来说,这个工具是用perl编写的。

相关内容

  • 没有找到相关文章

最新更新