1: <?php
2:
3:
4:
5: 6: 7: 8: 9: 10: 11:
/**
 * Small, forgiving HTML tokenizer that turns markup into a node tree.
 *
 * Container elements become ArbitraryCompositeNode instances, void elements
 * become ArbitraryNode instances and non-blank text runs become StringNode
 * instances. Those node classes are expected to be provided elsewhere in the
 * project; this class only calls their constructors, addNode() and getParent().
 */
class HTMLLoader {

    /**
     * When true, attributeStrToArray() throws on an unterminated attribute
     * (dangling "=" or unclosed quote) instead of returning what it parsed.
     */
    protected $validateAttribute = false;

    /**
     * Node returned by the most recent parse() call.
     *
     * Declared explicitly because writing to an undeclared property is
     * deprecated since PHP 8.2. Kept public so that code which read the former
     * dynamic property from outside the class keeps working.
     */
    public $lastResult = null;

    /**
     * Tells whether a text run contains anything other than whitespace.
     *
     * @param string $text
     * @return bool
     */
    public function noEmptyText($text) {
        return trim($text) !== '';
    }

    /**
     * Downloads a document, strips comments, scripts and styles, and parses
     * the content of its <html> element (or the whole payload when no <html>
     * element is found).
     *
     * @param string $url Anything file_get_contents() accepts.
     * @return mixed The node returned by parse().
     * @throws RuntimeException When the resource cannot be read.
     */
    public function parseFromUrl($url) {
        $content = file_get_contents($url);
        if ($content === false) {
            // Previously the failure was fed into the regexes and produced an
            // empty document, hiding the real problem from the caller.
            throw new RuntimeException('Unable to load HTML from ' . $url);
        }

        $noise = array(
            '/<!--(.*?)-->/is',
            '/<script.*?>(.*?)<\/script>/is',
            '/<style.*?>(.*?)<\/style>/is',
        );
        foreach ($noise as $pattern) {
            $stripped = preg_replace($pattern, '', $content);
            // preg_replace() returns null on a PCRE error (e.g. backtrack
            // limit); keep the unstripped text rather than losing the page.
            if ($stripped !== null) {
                $content = $stripped;
            }
        }

        $html = $content;
        if (preg_match('/<html.*?>(.*?)<\/html>/is', $content, $match) === 1) {
            $html = $match[1];
        }
        return $this->parse($html);
    }

    /**
     * Parses an HTML fragment into a node tree.
     *
     * The returned node is whichever node is current when the input ends: the
     * synthetic "document" root when every container was closed, otherwise
     * the innermost container still open.
     *
     * @param string $html
     * @return mixed The current node at end of input (also stored in $lastResult).
     */
    public function parse($html) {
        // NOTE(review): entities are decoded before tokenizing, so escaped
        // markup such as "&lt;b&gt;" is parsed as a real tag. Left as is
        // because callers may rely on it - confirm before changing.
        $html = htmlspecialchars_decode($html, ENT_QUOTES);
        $length = strlen($html);
        $target = new ArbitraryCompositeNode('document');

        $open = false;       // inside "<...>"
        $end = false;        // the current tag is a closing tag
        $eatAttr = false;    // tag name finished, collecting the attribute text
        $content = '';       // pending text outside of tags
        $stringTag = '';     // name of the current tag
        $attributeStr = '';  // raw attribute text of the current tag

        for ($i = 0; $i < $length; $i++) {
            $char = $html[$i];

            if ($char === '<') {
                if ($this->noEmptyText($content)) {
                    $target->addNode(new StringNode($content));
                }
                $content = '';
                $open = true;
                $end = false;
                if ($i + 1 < $length && $html[$i + 1] === '/') {
                    $end = true;
                    $i++; // consume the slash of "</"
                }
            } elseif ($open && $char === '>') {
                if ($end) {
                    // Closing tag: step back to the parent, never above the root.
                    if (!is_null($target->getParent())) {
                        $target = $target->getParent();
                    }
                } elseif ($stringTag !== '') {
                    $attributes = $this->attributeStrToArray($attributeStr);
                    if ($this->isSingleNode($stringTag)) {
                        $target->addNode(new ArbitraryNode($stringTag, $attributes));
                    } else {
                        $node = new ArbitraryCompositeNode($stringTag, $attributes);
                        $target->addNode($node);
                        $target = $node; // following content nests inside it
                    }
                }
                // An opening tag without a name ("<!DOCTYPE ...>", "<?xml ...?>")
                // is skipped; it used to create a nameless container that
                // swallowed the rest of the document.

                $open = false;
                $end = false;
                $eatAttr = false;
                $stringTag = '';
                $attributeStr = '';
            } elseif ($open && !$eatAttr && preg_match('/[a-zA-Z0-9]/', $char)) {
                $stringTag .= $char;
            } elseif ($open) {
                // First non-alphanumeric character ends the tag name.
                $eatAttr = true;
                $attributeStr .= $char;
            } else {
                $content .= $char;
            }
        }

        if ($this->noEmptyText($content)) {
            $target->addNode(new StringNode($content));
        }

        return $this->lastResult = $target;
    }

    /**
     * Tells whether a tag is a void element, i.e. one that never has a
     * closing tag and therefore cannot contain children.
     *
     * @param string $stringTag Tag name, case-insensitive.
     * @return bool
     */
    public function isSingleNode($stringTag) {
        $voidElements = array(
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
            'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
        );
        return in_array(strtolower($stringTag), $voidElements, true);
    }

    /**
     * Converts the raw attribute text of a tag into a name => value map.
     *
     * Supports double-quoted, single-quoted and unquoted values as well as
     * valueless attributes (stored with an empty string). A lone "/" left over
     * from a self-closing tag ("<br />") is ignored. When a name occurs twice
     * the last value wins.
     *
     * @param string $attributeStr Text between the tag name and ">".
     * @return array
     * @throws Exception When $validateAttribute is true and the text ends
     *                   after "=" or inside a quoted value.
     */
    protected function attributeStrToArray($attributeStr) {
        $attr = array();
        $attributeStr = trim($attributeStr);
        $l = strlen($attributeStr);
        $key = '';
        $value = '';
        // States: 0 reading a name, 1 after "=", 2 inside "...", 3 inside '...',
        //         4 inside an unquoted value, 5 name read, waiting for "=" or next name.
        $state = 0;

        for ($i = 0; $i < $l; $i++) {
            $char = $attributeStr[$i];
            $isSpace = strpos(" \t\n\r\f\v", $char) !== false;

            if ($state === 5) {
                if ($isSpace) {
                    continue;
                }
                if ($char === '=') {
                    $state = 1;
                    continue;
                }
                // Another name begins, so the pending one had no value.
                if ($key !== '/') {
                    $attr[$key] = '';
                }
                $key = '';
                $state = 0; // handle $char as the first character of the new name
            }

            if ($state === 0) {
                if ($char === '=') {
                    $state = 1;
                } elseif ($isSpace) {
                    if ($key !== '') {
                        $state = 5;
                    }
                } else {
                    $key .= $char;
                }
            } elseif ($state === 1) {
                if ($char === '"') {
                    $state = 2;
                } elseif ($char === '\'') {
                    $state = 3;
                } elseif (!$isSpace) {
                    $value = $char;
                    $state = 4;
                }
            } elseif ($state === 2 || $state === 3) {
                $closingQuote = ($state === 2) ? '"' : '\'';
                if ($char === $closingQuote) {
                    $attr[$key] = $value;
                    $key = '';
                    $value = '';
                    $state = 0;
                } else {
                    $value .= $char;
                }
            } else { // state 4: unquoted value runs until whitespace
                if ($isSpace) {
                    $attr[$key] = $value;
                    $key = '';
                    $value = '';
                    $state = 0;
                } else {
                    $value .= $char;
                }
            }
        }

        if ($state === 4) {
            // Unquoted value terminated by the end of the tag.
            $attr[$key] = $value;
        } elseif (($state === 0 || $state === 5) && $key !== '' && $key !== '/') {
            // Trailing valueless attribute.
            $attr[$key] = '';
        } elseif ($state >= 1 && $state <= 3 && $this->validateAttribute) {
            throw new Exception('Attribute syntax error');
        }

        return $attr;
    }

}
157: