Overview

Packages

  • PHP
  • vsword
    • node
    • parser
      • addesed
    • structure
      • style

Classes

  • DefaultInitNode
  • HTMLLoader
  • HtmlParser
  • Parser
  • Overview
  • Package
  • Class
  • Tree
  1: <?php
  /**
   * Class HtmlParser
   *
   * Loads an HTML string through HTMLLoader and translates the resulting node
   * tree into the body of a VsWord document.
   *
   * @version 1.0.2
   * @author v.raskin
   * @package vsword.parser
   */
  class HtmlParser extends Parser {

      /**
       * Document whose body receives the translated content.
       *
       * @var VsWord
       */
      protected $word;

      /**
       * HTML node most recently handed to translate().
       */
      protected $currentHTMLNode;

      /**
       * Stores the target document and registers the default node initialiser.
       *
       * @param VsWord $word
       */
      public function __construct(VsWord $word) {
          $this->word = $word;
          $this->addHandlerInitNode(new DefaultInitNode($this->word));
      }

      /**
       * Parses an HTML string and adds its content to the document body.
       *
       * @param string $html
       */
      public function parse($html) {
          $html = $this->stripString($html);
          $loader = new HTMLLoader();
          $dom = $loader->parse($html);
          $body = $this->word->getDocument()->getBody();
          $this->translate($dom, $body);
      }

      /**
       * Recursively translates one HTML node (and its children) into the
       * document, starting at the given document node.
       *
       * @param mixed $ntmlNode HTML node produced by HTMLLoader
       * @param mixed $wordNode document node the content is added to
       */
      protected function translate($ntmlNode, $wordNode) {
          $this->currentHTMLNode = $ntmlNode;

          // Plain text has no tag of its own: hand it to the nearest node
          // that accepts text and stop descending.
          if ($ntmlNode instanceof StringNode) {
              $this->addText($ntmlNode->getText(), $wordNode);
              return;
          }

          $node = $this->initNode($ntmlNode->getName(), $ntmlNode->getAttributes());
          $addeded = NodeAddeded::init($node, $wordNode, $this);

          if ($ntmlNode instanceof CompositeNode) {
              foreach ($ntmlNode->getChildrens() as $nNode) {
                  // getNewNode() is queried for every child on purpose; it is
                  // not known here whether its result is stable between calls.
                  $this->translate($nNode, $addeded->getNewNode());
              }
          }
      }

      /**
       * Returns the HTML node most recently passed to translate().
       */
      public function getCurrentHTMLNode() {
          return $this->currentHTMLNode;
      }

      /**
       * Tells whether the text contains anything besides whitespace.
       *
       * @param string $text
       * @return boolean
       */
      public function noEmptyText($text) {
          return trim($text) !== '';
      }

      /**
       * Downloads a page, removes comments, scripts and styles, and parses the
       * content of its body element. A page without a body element is parsed
       * as a whole.
       *
       * @param string $url
       * @return boolean false when the page could not be read, true otherwise
       */
      public function parseFromUrl($url) {
          $content = file_get_contents($url);
          if ($content === false) {
              // Nothing was downloaded, so there is nothing to parse.
              return false;
          }

          $patterns = array(
              '/<!--(.*?)-->/is',
              '/<script.*?>(.*?)<\/script>/is',
              '/<style.*?>(.*?)<\/style>/is',
          );
          foreach ($patterns as $pattern) {
              $cleaned = preg_replace($pattern, '', $content);
              // preg_replace() yields null on a PCRE error; keep the previous
              // content in that case instead of losing the whole page.
              if ($cleaned !== null) {
                  $content = $cleaned;
              }
          }

          if (preg_match('/<body.*?>(.*?)<\/body>/is', $content, $match)) {
              $html = $match[1];
          } else {
              // No body element found: treat the whole content as the fragment.
              $html = $content;
          }

          $this->parse($html);
          return true;
      }

      /**
       * Converts the HTML entities that have a known replacement and removes
       * every other alphanumeric entity.
       *
       * @param string $html
       * @return string
       */
      protected function stripString($html) {
          // str_replace() returns the new string; the result must be kept,
          // otherwise these entities are deleted by the pattern below.
          $html = str_replace(
              array('&nbsp;', '&quot;', '&laquo;', '&copy;', '&raquo;'),
              array(' ', "'", '"', '©', '"'),
              $html
          );
          return preg_replace('/\&[a-zA-Z0-9]{1,}\;/is', '', $html);
      }

      /**
       * Adds text to the given node or, failing that, to its closest ancestor
       * that implements INodeTextAdded.
       *
       * @param string $text
       * @param Node $node
       * @return mixed the result of addText() on the accepting node, or false
       *               when neither the node nor any ancestor accepts text
       */
      protected function addText($text, $node) {
          for ($current = $node; !is_null($current); $current = $current->getParent()) {
              if (!($current instanceof INodeTextAdded)) {
                  continue;
              }
              $nText = $current->addText($text);
              if ($nText instanceof RCompositeNode) {
                  $nText->getParent()->clearTextStyle();
              }
              return $nText;
          }
          return false;
      }

  }
wordx API documentation generated by ApiGen 2.8.0