summaryrefslogtreecommitdiff
path: root/lib/querypath/src/QueryPath.php
blob: 20133777e5473130145a337639073024a0b02631 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
<?php
/** @file
 * The Query Path package provides tools for manipulating a structured document.
 * Typically, the sort of structured document is one using a Document Object Model
 * (DOM).
 * The two major DOMs are the XML DOM and the HTML DOM. Using Query Path, you can
 * build, parse, search, and modify DOM documents.
 *
 * To use QueryPath, only one file must be imported: qp.php. This file defines
 * the `qp()` function, and also registers an autoloader if necessary.
 *
 * Standard usage:
 * @code
 * <?php
 * require 'qp.php';
 *
 * $xml = '<?xml version="1.0"?><test><foo id="myID"/></test>';
 *
 * // Procedural call a la jQuery:
 * $qp = qp($xml, '#myID');
 * $qp->append('<new><elements/></new>')->writeHTML();
 *
 * // Object-oriented version with a factory:
 * $qp = QueryPath::with($xml)->find('#myID');
 * $qp->append('<new><elements/></new>')->writeHTML();
 * ?>
 * @endcode
 *
 * The above would print (formatted for readability):
 * @code
 * <?xml version="1.0"?>
 * <test>
 *  <foo id="myID">
 *    <new>
 *      <elements/>
 *    </new>
 *  </foo>
 * </test>
 * @endcode
 *
 * ## Discovering the Library
 *
 * To gain familiarity with QueryPath, the following three API docs are
 * the best to start with:
 *
 *- qp(): This function constructs new queries, and is the starting point
 *  for manipulating a document. htmlqp() is an alias tuned for HTML
 *  documents (especially old HTML), and QueryPath::with(), QueryPath::withXML()
 *  and QueryPath::withHTML() all perform a similar role, but in a purely
 *  object oriented way.
 *- QueryPath: This is the top-level class for the library. It defines the
 *  main factories and some useful functions.
 *- QueryPath::Query: This defines all of the functions in QueryPath. When
 *  working with HTML and XML, the QueryPath::DOMQuery is the actual
 *  implementation that you work with.
 *
 * Included with the source code for QueryPath is a complete set of unit tests
 * as well as some example files. Those are good resources for learning about
 * how to apply QueryPath's tools. The full API documentation can be generated
 * from these files using Doxygen, or you can view it online at
 * http://api.querypath.org.
 *
 * If you are interested in building extensions for QueryPath, see the
 * QueryPath and QueryPath::Extension classes. There you will find information on adding
 * your own tools to QueryPath.
 *
 * QueryPath also comes with a full CSS 3 selector implementation (now
 * with partial support for the current draft of the CSS 4 selector spec). If
 * you are interested in reusing that in other code, you will want to start
 * with QueryPath::CSS::EventHandler.php, which is the event interface for the parser.
 *
 * All of the code in QueryPath is licensed under an MIT-style
 * license. All of the code is Copyright, 2012 by Matt Butcher.
 *
 * @author M Butcher <matt @aleph-null.tv>
 * @license MIT
 * @see QueryPath
 * @see qp()
 * @see http://querypath.org The QueryPath home page.
 * @see http://api.querypath.org An online version of the API docs.
 * @see http://technosophos.com For how-tos and examples.
 * @copyright Copyright (c) 2009-2012, Matt Butcher.
 * @version -UNSTABLE% (3.x.x)
 *
 */

use \Masterminds\HTML5;

/**
 * Top-level entry point for the QueryPath library.
 *
 * This class is a collection of constants and static helpers. It provides:
 * - factories (with(), withXML(), withHTML(), withHTML5()) that build a
 *   query object around a document;
 * - extension management (enable(), enabledExtensions());
 * - a utility for building Data URLs (encodeDataURL()).
 *
 * The class declares no instance state; every method is static.
 */
class QueryPath {
  /**
   * The version string for this version of QueryPath.
   *
   * Standard releases will be of the following form: <MAJOR>.<MINOR>[.<PATCH>][-STABILITY].
   *
   * Examples:
   * - 2.0
   * - 2.1.1
   * - 2.0-alpha1
   *
   * Developer releases will always be of the form dev-<DATE>.
   *
   * @since 2.0
   */
  const VERSION = '3.0.x';

  /**
   * Major version number.
   *
   * Examples:
   * - 3
   * - 4
   *
   * @since 3.0.1
   */
  const VERSION_MAJOR = 3;

  /**
   * This is a stub HTML 4.01 document.
   *
   * <b>Using {@link QueryPath::XHTML_STUB} is preferred.</b>
   *
   * This is primarily for generating legacy HTML content. Modern web applications
   * should use QueryPath::XHTML_STUB.
   *
   * Use this stub with the HTML family of methods (QueryPath::Query::html(),
   * QueryPath::Query::writeHTML(), QueryPath::Query::innerHTML()).
   */
  const HTML_STUB = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
  <html lang="en">
  <head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  <title>Untitled</title>
  </head>
  <body></body>
  </html>';

  /**
   * This is a stub HTML5 document.
   *
   * It contains an HTML5 doctype, an empty body and a placeholder title.
   */
  const HTML5_STUB = '<!DOCTYPE html>
    <html>
    <head>
    <title>Untitled</title>
    </head>
    <body></body>
    </html>';

  /**
   * This is a stub XHTML document.
   *
   * Since XHTML is an XML format, you should use XML functions with this document
   * fragment. For example, you should use {@link xml()}, {@link innerXML()}, and
   * {@link writeXML()}.
   *
   * This can be passed into {@link qp()} to begin a new basic HTML document.
   *
   * Example:
   * @code
   * $qp = qp(QueryPath::XHTML_STUB); // Creates a new XHTML document
   * $qp->writeXML(); // Writes the document as well-formed XHTML.
   * @endcode
   * @since 2.0
   */
  const XHTML_STUB = '<?xml version="1.0"?>
  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
  <title>Untitled</title>
  </head>
  <body></body>
  </html>';


  /**
   * Create a new query object for a document.
   *
   * This instantiates the query class and hands it the document, the
   * selector and the options unchanged.
   *
   * @param mixed $document
   *   The document to query. It is passed to the query class constructor
   *   as-is.
   * @param string $selector
   *   An optional selector. It is passed to the query class constructor
   *   as-is.
   * @param array $options
   *   An associative array of options. This method itself reads only:
   *   - QueryPath_class: The name of the class to instantiate. Defaults to
   *     '\QueryPath\DOMQuery'.
   *   The complete array is also passed to the constructor.
   *
   * @return object
   *   A new instance of the selected query class (\QueryPath\DOMQuery unless
   *   overridden through QueryPath_class).
   */
  public static function with($document = NULL, $selector = NULL, $options = array()) {
    $qpClass = isset($options['QueryPath_class']) ? $options['QueryPath_class'] : '\QueryPath\DOMQuery';

    return new $qpClass($document, $selector, $options);
  }

  /**
   * Create a new query object, defaulting to the XML parser.
   *
   * This sets the 'use_parser' option to 'xml' unless the caller already
   * supplied a 'use_parser' value, and then delegates to QueryPath::with().
   *
   * @param mixed $source
   *   The document to query; passed through to QueryPath::with().
   * @param string $selector
   *   An optional selector; passed through to QueryPath::with().
   * @param array $options
   *   An associative array of options; see QueryPath::with().
   *
   * @return object
   *   The query object created by QueryPath::with().
   */
  public static function withXML($source = NULL, $selector = NULL, $options = array()) {
    // The array union keeps any value the caller already set for this key.
    $options += array(
      'use_parser' => 'xml',
    );
    return self::with($source, $selector, $options);
  }

  /**
   * Create a new query object, defaulting to options suited to HTML.
   *
   * The following defaults are applied; a caller-supplied value for the same
   * key takes precedence:
   * - ignore_parser_warnings: TRUE
   * - convert_to_encoding: 'ISO-8859-1'
   * - convert_from_encoding: 'auto'
   * - use_parser: 'html'
   *
   * The call to QueryPath::with() is made with PHP's error-suppression
   * operator, so diagnostics raised while the query object is being
   * constructed are not reported.
   *
   * @param mixed $source
   *   The document to query; passed through to QueryPath::with().
   * @param string $selector
   *   An optional selector; passed through to QueryPath::with().
   * @param array $options
   *   An associative array of options; see QueryPath::with().
   *
   * @return object
   *   The query object created by QueryPath::with().
   */
  public static function withHTML($source = NULL, $selector = NULL, $options = array()) {
    // Need a way to force an HTML parse instead of an XML parse when the
    // doctype is XHTML, since many XHTML documents are not valid XML
    // (because of coding errors, not by design).

    $options += array(
      'ignore_parser_warnings' => TRUE,
      'convert_to_encoding' => 'ISO-8859-1',
      'convert_from_encoding' => 'auto',
      //'replace_entities' => TRUE,
      'use_parser' => 'html',
      // This is stripping actually necessary low ASCII.
      //'strip_low_ascii' => TRUE,
    );
    return @self::with($source, $selector, $options);
  }

  /**
   * Parse HTML5 documents.
   *
   * This uses HTML5-PHP to parse the document. In actuality, this parser does
   * a fine job with pre-HTML5 documents in most cases, though really old HTML
   * (like 2.0) may have some substantial quirks.
   *
   * A string source that contains both '<' and '>' is parsed as markup; any
   * other string is handed to the parser's load() method. Non-string sources
   * are not parsed here at all.
   *
   * <b>Supported Options</b>
   * The following options have meaning to QueryPath.
   * - QueryPath_class
   *
   *
   * @param mixed $source
   *  A document as an HTML string, or a path/URL. For compatibility with
   *  existing functions, a DOMDocument, SimpleXMLElement, DOMNode or array
   *  of DOMNodes will be passed through as well. However, these types are not
   *  validated in any way.
   *
   * @param string $selector
   *  A CSS3 selector.
   *
   * @param array $options
   *   An associative array of options. The array is handed to the query
   *   object's constructor; this method does not forward it to the HTML5-PHP
   *   parser, which is created with its own defaults. Note that the standard
   *   QueryPath options may be ignored for this function, since it uses a
   *   different parser.
   *
   * @return object
   *   The query object created by QueryPath::with() (\QueryPath\DOMQuery
   *   unless overridden through QueryPath_class).
   */
  public static function withHTML5($source = NULL, $selector = NULL, $options = array()) {
    if (is_string($source)) {
      $html5 = new HTML5();
      // Both angle brackets present: treat the string as markup. Otherwise
      // treat it as something to load.
      $isMarkup = strpos($source, '<') !== FALSE && strpos($source, '>') !== FALSE;
      $source = $isMarkup ? $html5->loadHTML($source) : $html5->load($source);
    }

    // Class selection and construction are shared with the other factories.
    return self::with($source, $selector, $options);
  }

  /**
   * Enable one or more extensions.
   *
   * Extensions provide additional features to QueryPath. To enable an
   * extension, you can use this method.
   *
   * In this example, we enable the QPTPL extension:
   * @code
   * <?php
   * QueryPath::enable('\QueryPath\QPTPL');
   * ?>
   * @endcode
   *
   * Note that the name is a fully qualified class name.
   *
   * We can enable more than one extension at a time like this:
   *
   * @code
   * <?php
   * $extensions = array('\QueryPath\QPXML', '\QueryPath\QPDB');
   * QueryPath::enable($extensions);
   * ?>
   * @endcode
   *
   * @attention If you are not using an autoloader, you will need to
   * manually `require` or `include` the files that contain the
   * extensions.
   *
   * @param mixed $extensionNames
   *   The name of an extension or an array of extension names.
   *   QueryPath assumes that these are extension class names,
   *   and attempts to register these as QueryPath extensions.
   */
  public static function enable($extensionNames) {
    // Wrap a single name in an array so both call styles share one loop.
    $names = is_array($extensionNames) ? $extensionNames : array($extensionNames);

    foreach ($names as $extension) {
      \QueryPath\ExtensionRegistry::extend($extension);
    }
  }

  /**
   * Get a list of all of the enabled extensions.
   *
   * This example dumps a list of extensions to standard output:
   * @code
   * <?php
   * $extensions = QueryPath::enabledExtensions();
   * print_r($extensions);
   * ?>
   * @endcode
   *
   * @return array
   *   An array of extension names.
   *
   * @see QueryPath::ExtensionRegistry
   */
  public static function enabledExtensions() {
    return \QueryPath\ExtensionRegistry::extensionNames();
  }



  /**
   * A static function for transforming data into a Data URL.
   *
   * This can be used to create Data URLs for injection into CSS, JavaScript, or other
   * non-XML/HTML content. If you are working with QP objects, you may want to use
   * dataURL() instead.
   *
   * @attention Any string accepted by FILTER_VALIDATE_URL is fetched with
   * file_get_contents() rather than encoded literally. Do not pass untrusted
   * input as $data unless fetching that location is acceptable.
   *
   * @param mixed $data
   *  The contents to inject as the data. The value can be any one of the following:
   *  - A URL: If this is given, then the subsystem will read the content from that URL. THIS
   *    MUST BE A FULL URL, not a relative path.
   *  - A string of data: If this is given, then the subsystem will encode the string.
   *  - A stream or file handle: If this is given, the stream's contents will be encoded
   *    and inserted as data.
   *  (Note that we make the assumption here that you would never want to set data to be
   *  a URL. If this is an incorrect assumption, file a bug.)
   * @param string $mime
   *  The MIME type of the document.
   * @param resource $context
   *  A valid context. Use this only if you need to pass a stream context. This is only necessary
   *  if $data is a URL. (See {@link stream_context_create()}).
   * @return string
   *  An encoded data URL.
   * @throws \RuntimeException
   *  If the stream or the URL could not be read. Without this check a failed
   *  read would produce a well-formed but empty data URL.
   */
  public static function encodeDataURL($data, $mime = 'application/octet-stream', $context = NULL) {
    if (is_resource($data)) {
      $data = stream_get_contents($data);
      if ($data === FALSE) {
        throw new \RuntimeException('Could not read data from the supplied stream.');
      }
    }
    elseif (filter_var($data, FILTER_VALIDATE_URL)) {
      $url = $data;
      $data = file_get_contents($url, FALSE, $context);
      if ($data === FALSE) {
        throw new \RuntimeException('Could not read data from URL: ' . $url);
      }
    }

    return 'data:' . $mime . ';base64,' . base64_encode($data);
  }

}