Overview

Packages

  • PHP
  • vsword
    • node
    • parser
      • addesed
    • structure
      • style

Classes

  • DefaultInitNode
  • HTMLLoader
  • HtmlParser
  • Parser
  • Overview
  • Package
  • Class
  • Tree
  1: <?php
  2: 
  3:  
  4: 
  5: /**
  6:  * Class HTMLLoader
  7:  *
  8:  * @version 1.0.2
  9:  * @author v.raskin
 10:  * @package vsword.parser
 11:  */
 12: class HTMLLoader {
 13: 
 14:     protected $validateAttribute = FALSE;
 15:     
 16:     
 17:     
 18:     /**
 19:     * @return boolean
 20:     */
 21:     public function noEmptyText($text) {
 22:         return trim($text) != '';
 23:     }
 24:     
 25:  
 26:     
 27: 
 28:     
 29:     public function parseFromUrl($url) {
 30:         $content = file_get_contents($url);  
 31:         $content = preg_replace('/<!--(.*?)-->/is', '',$content );  
 32:         $content = preg_replace('/<script.*?>(.*?)<\/script>/is', '',$content );
 33:         $content = preg_replace('/<style.*?>(.*?)<\/style>/is', '',$content );
 34:         preg_match('/<html.*?>(.*?)<\/html>/is', $content, $match);  
 35:         $html = (isset($match[1])) ? $match[1] : $content; 
 36:         return $this->parse($html);
 37:     }
 38:     
 39:     /**
 40:      * 
 41:      */
 42:     public function parse($html) {
 43:         $html = htmlspecialchars_decode($html, ENT_QUOTES);
 44:         $i = 0;
 45:         $length = strlen($html);
 46:         $target = new ArbitraryCompositeNode('document');
 47:         $open = false;
 48:         $end = false;
 49:         $content = '';
 50:         $eatAttr = false;
 51:         $stringTag = '';
 52:         $attributeStr = '';
 53:             
 54:         while($length > $i) {
 55:             $char = substr($html, $i ++, 1);
 56:             if($char == '<') { 
 57:                 if($this->noEmptyText($content)) { 
 58:                     $target->addNode(new StringNode($content));
 59:                 }
 60:                 $content = '';
 61:                 $open = true;
 62:                 $end = false;
 63:                 if(substr($html, $i, 1) == '/') { 
 64:                     $end = true;
 65:                     $i ++;
 66:                 }
 67:             } else if($open  && $char == '>') {
 68:                  
 69:                 if($end ) { //close tag 
 70:                     if($this->noEmptyText($content)) {
 71:                         $target->addNode(new StringNode($content));
 72:                     }
 73:                     $content = '';
 74:                     if(!is_null($target->getParent())) {
 75:                         $target = $target->getParent();
 76:                     }
 77:                 } else { 
 78:                     if($this->isSingleNode($stringTag)) {
 79:                         $node = new ArbitraryNode($stringTag, $this->attributeStrToArray($attributeStr));
 80:                         $target->addNode($node);
 81:                     } else {
 82:                         $node = new ArbitraryCompositeNode($stringTag, $this->attributeStrToArray($attributeStr));
 83:                         $target->addNode($node);
 84:                         $target = $node;
 85:                     } 
 86:                      
 87:                 }
 88:                 
 89:                 $open = false;
 90:                 $end = false;        
 91:                 $stringTag = '';
 92:                 $eatAttr = false; 
 93:                 $attributeStr = '';
 94:             
 95:             }  else if($open && !$eatAttr && preg_match('/[a-zA-Z0-9]/', $char)) {
 96:                 $stringTag .= $char;
 97:             }  else if($open) {
 98:                 $eatAttr = true;
 99:                 $attributeStr .= $char;
100:             } else if(!$open) {
101:                 $content .= $char; 
102:             }
103:         }
104:         if($this->noEmptyText($content)) {
105:             $target->addNode(new StringNode($content));
106:         }
107:             
108:         return $this->lastResult = $target;
109:             
110:     }
111:     
112:     /**
113:     * @return boolean
114:     */
115:     public function isSingleNode($stringTag) {
116:         return in_array(strtolower($stringTag), array('br', 'hr', 'meta', 'link', 'input', 'img',));
117:     }
118:     
119:     /**
120:     * @param string $attributeStr
121:     * @return array
122:     */
123:     protected function attributeStrToArray($attributeStr) { 
124:         $attr = array();
125:         $attributeStr = trim($attributeStr);
126:         $l = strlen($attributeStr);
127:         $key = '';
128:         $value = '';
129:         $state = 0;
130:         for($i = 0;  $i < $l; $i ++) {
131:             $char = substr($attributeStr, $i, 1);
132:             if($state == 0 && $char == '=') {
133:                 $state = 1;
134:             } else if($state == 1 && $char == '"') {
135:                 $state = 2;
136:             } else if($state == 1 && $char == '\'') {
137:                 $state = 3;
138:             } else if(($state == 3 && $char == '\'') || ($state == 2 && $char == '"')) {
139:                 $attr[trim($key)] = $value; 
140:                 $key = '';
141:                 $value = '';
142:                 $state = 0;
143:             } else if($state == 2 || $state == 3) {
144:                 $value .= $char;
145:             } else if($state == 0) {
146:                 $key .= $char;
147:             } 
148:         }
149:         if($state != 0 && $this->validateAttribute) {
150:             throw new Exception('Attribute syntax error');
151:         }  
152:         return $attr;
153:     }
154:     
155:  
156: }
157: 
wordx API documentation generated by ApiGen 2.8.0