1: <?php
2: 3: 4: 5: 6: 7: 8:
/**
 * Translates HTML markup into nodes of a VsWord document.
 *
 * The markup is loaded into a node tree by HTMLLoader and walked recursively;
 * every HTML element is converted through initNode() (presumably provided by
 * the Parser base class - confirm there) and attached to the document tree
 * with NodeAddeded, while text nodes are appended via addText().
 */
class HtmlParser extends Parser {

    /**
     * Document that receives the translated content.
     *
     * @var VsWord
     */
    protected $word;

    /**
     * HTML node most recently passed to translate().
     *
     * @var mixed
     */
    protected $currentHTMLNode;

    /**
     * Stores the target document and registers a DefaultInitNode handler
     * built from it.
     *
     * @param VsWord $word Document the parsed HTML is written into.
     */
    public function __construct(VsWord $word) {
        $this->word = $word;
        $this->addHandlerInitNode(new DefaultInitNode($this->word));
    }

    /**
     * Parses an HTML string and appends the result to the document body.
     *
     * @param string $html HTML markup to translate.
     */
    public function parse($html) {
        $html = $this->stripString($html);
        $loader = new HTMLLoader();
        $dom = $loader->parse($html);

        $body = $this->word->getDocument()->getBody();
        $this->translate($dom, $body);
    }

    /**
     * Recursively translates one HTML node (and its children) into the
     * document tree below $wordNode.
     *
     * @param mixed $ntmlNode HTML node produced by HTMLLoader.
     * @param mixed $wordNode Document node the translated content is attached to.
     */
    protected function translate($ntmlNode, $wordNode) {
        // Exposed through getCurrentHTMLNode() while the node is being handled.
        $this->currentHTMLNode = $ntmlNode;

        // Text nodes carry no element of their own: just append their text.
        if ($ntmlNode instanceof StringNode) {
            $this->addText($ntmlNode->getText(), $wordNode);
            return;
        }

        $node = $this->initNode($ntmlNode->getName(), $ntmlNode->getAttributes());
        $addeded = NodeAddeded::init($node, $wordNode, $this);

        if ($ntmlNode instanceof CompositeNode) {
            foreach ($ntmlNode->getChildrens() as $nNode) {
                // getNewNode() is queried for every child on purpose: whether
                // its result can change between calls is not visible here.
                $this->translate($nNode, $addeded->getNewNode());
            }
        }
    }

    /**
     * Returns the HTML node most recently passed to translate().
     *
     * @return mixed
     */
    public function getCurrentHTMLNode() {
        return $this->currentHTMLNode;
    }

    /**
     * Tells whether the text contains anything besides surrounding whitespace.
     *
     * @param string $text Text to check.
     * @return bool True when the trimmed text is not empty.
     */
    public function noEmptyText($text) {
        return trim($text) !== '';
    }

    /**
     * Downloads a page, removes comments, scripts and styles, and parses the
     * content of its <body> element.
     *
     * When the page has no <body> element the whole cleaned content is parsed
     * instead, so plain HTML fragments are supported as well.
     *
     * NOTE(review): $url is handed to file_get_contents() unchanged; callers
     * must not pass untrusted locations.
     *
     * @param string $url URL or file path of the HTML page.
     * @throws \RuntimeException When the page cannot be read or cleaned.
     */
    public function parseFromUrl($url) {
        $content = file_get_contents($url);
        if ($content === false) {
            throw new \RuntimeException('Unable to read HTML from: ' . $url);
        }

        // Patterns are applied in order: comments first, then scripts, then styles.
        $content = preg_replace(
            array(
                '/<!--(.*?)-->/is',
                '/<script.*?>(.*?)<\/script>/is',
                '/<style.*?>(.*?)<\/style>/is',
            ),
            '',
            $content
        );
        if ($content === null) {
            // preg_replace() returns null on a PCRE error (e.g. backtrack limit).
            throw new \RuntimeException('Unable to clean HTML from: ' . $url);
        }

        $html = $content;
        if (preg_match('/<body.*?>(.*?)<\/body>/is', $content, $match) === 1) {
            $html = $match[1];
        }

        $this->parse($html);
    }

    /**
     * Replaces a few common HTML entities with plain characters and removes
     * every other named or numeric-looking entity.
     *
     * @param string $html HTML markup.
     * @return string Markup without "&name;" style entities.
     */
    protected function stripString($html) {
        // The result must be assigned: str_replace() does not modify its
        // argument. Known entities are mapped before the catch-all removal
        // below would otherwise drop them.
        $html = str_replace(
            array('&nbsp;', '&quot;', '&laquo;', '&copy;', '&raquo;'),
            array(' ', "'", '"', '©', '"'),
            $html
        );

        return preg_replace('/\&[a-zA-Z0-9]{1,}\;/is', '', $html);
    }

    /**
     * Adds text to the nearest node, starting at $node and walking up through
     * getParent(), that implements INodeTextAdded.
     *
     * If the node returned by addText() is an RCompositeNode, clearTextStyle()
     * is called on its parent.
     *
     * @param string $text Text to add.
     * @param mixed  $node Node where the search for a text-capable node starts.
     * @return mixed The value returned by the node's addText(), or false when
     *               no node in the parent chain accepts text.
     */
    protected function addText($text, $node) {
        for ($target = $node; $target !== null; $target = $target->getParent()) {
            if (!($target instanceof INodeTextAdded)) {
                continue;
            }

            $textNode = $target->addText($text);
            if ($textNode instanceof RCompositeNode) {
                $textNode->getParent()->clearTextStyle();
            }

            return $textNode;
        }

        return false;
    }

}