summaryrefslogtreecommitdiff
path: root/lib/querypath/src/QueryPath/Entities.php
blob: 5670dc5101cdd3b54a7a6c483089bdbb10b5d58d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
<?php
/**
 * @file
 * HTML entity utilities.
 */

namespace QueryPath;

/**
 * Perform various tasks on HTML/XML entities.
 *
 * @ingroup querypath_util
 */
class Entities {

  /**
   * Pattern used to locate entities and stray ampersands.
   *
   * This is four alternatives wrapped into one regex. The | divides them.
   * 1: Match any char-based (named) entity. This will go in $matches[1]
   * 2: Match any decimal entity. This will go in $matches[2]
   * 3: Match any hex entity, including its leading "x". This will go in
   *    $matches[3]
   * 4: Match any ampersand that is not an entity. This goes in $matches[4]
   *    This last rule will only match if none of the previous three has
   *    already matched at that position.
   * XXX: Are octal encodings for entities acceptable?
   */
  protected static $regex = '/&([\w]+);|&#([\d]+);|&#(x[0-9a-fA-F]+);|(&)/m';

  /**
   * Replace all entities.
   * This will scan a string and will attempt to replace all
   * named entities with their numeric equivalent. Numeric (decimal and
   * hex) entities are passed through unchanged, and bare ampersands are
   * escaped as &#38;. This will not work with specialized entities: a
   * named entity that is missing from the lookup table keeps its text,
   * but its ampersand is escaped.
   *
   * @param string $string
   *  The string to perform replacements on.
   * @return string
   *  Returns a string that is similar to the original one, but with
   *  all entity replacements made.
   */
  public static function replaceAllEntities($string) {
    // __CLASS__ keeps the callback valid if the class is moved or renamed.
    return preg_replace_callback(self::$regex, array(__CLASS__, 'doReplacement'), $string);
  }

  /**
   * Callback for processing replacements.
   *
   * @param array $matches
   *  The regular expression replacement array. Index 0 is the full match;
   *  indexes 1-4 correspond to the four alternatives in self::$regex.
   * @return string
   *  The replacement text for the matched span.
   */
  protected static function doReplacement($matches) {
    // Inspect the capture groups directly instead of relying on how many
    // trailing groups PCRE happened to include in the array.

    // Group 4: an ampersand that does not start any entity.
    if (isset($matches[4])) {
      return '&#38;';
    }

    // Group 3: hex entity. The captured text still carries the "x" prefix.
    if (isset($matches[3]) && $matches[3] !== '') {
      return '&#' . $matches[3] . ';';
    }

    // Group 2: decimal entity.
    if (isset($matches[2]) && $matches[2] !== '') {
      return '&#' . $matches[2] . ';';
    }

    // Group 1: named entity.
    if (isset($matches[1]) && $matches[1] !== '') {
      $code = self::replaceEntity($matches[1]);
      if ($code === null) {
        // Unknown name: emitting "&#;" would be malformed, so keep the
        // text and escape only the ampersand.
        return '&#38;' . $matches[1] . ';';
      }
      return '&#' . $code . ';';
    }

    // Should be unreachable; never drop text from the input.
    return $matches[0];
  }

  /**
   * Lookup an entity string's numeric equivalent.
   *
   * @param string $entity
   *  The entity whose numeric value is needed, without the leading
   *  ampersand or trailing semicolon (e.g. "nbsp"). Names are
   *  case-sensitive.
   * @return int|null
   *  The integer value corresponding to the entity, or null if the name
   *  is not in the lookup table.
   * @author Matt Butcher
   * @author Ryan Mahoney
   */
  public static function replaceEntity($entity) {
    return isset(self::$entity_array[$entity]) ? self::$entity_array[$entity] : null;
  }

  /**
   * Conversion mapper for entities in HTML.
   * Large entity conversion table. This is
   * significantly broader in range than
   * get_html_translation_table(HTML_ENTITIES).
   *
   * Keys are entity names; values are the decimal code points used when
   * building numeric character references.
   *
   * This code comes from Rhizome ({@link http://code.google.com/p/sinciput})
   *
   * @todo See if we can do this as a const.
   * @see get_html_translation_table()
   */
  private static $entity_array = array(
    'nbsp' => 160, 'iexcl' => 161, 'cent' => 162, 'pound' => 163,
    'curren' => 164, 'yen' => 165, 'brvbar' => 166, 'sect' => 167,
    'uml' => 168, 'copy' => 169, 'ordf' => 170, 'laquo' => 171,
    'not' => 172, 'shy' => 173, 'reg' => 174, 'macr' => 175, 'deg' => 176,
    'plusmn' => 177, 'sup2' => 178, 'sup3' => 179, 'acute' => 180,
    'micro' => 181, 'para' => 182, 'middot' => 183, 'cedil' => 184,
    'sup1' => 185, 'ordm' => 186, 'raquo' => 187, 'frac14' => 188,
    'frac12' => 189, 'frac34' => 190, 'iquest' => 191, 'Agrave' => 192,
    'Aacute' => 193, 'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
    'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199, 'Egrave' => 200,
    'Eacute' => 201, 'Ecirc' => 202, 'Euml' => 203, 'Igrave' => 204,
    'Iacute' => 205, 'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
    'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211, 'Ocirc' => 212,
    'Otilde' => 213, 'Ouml' => 214, 'times' => 215, 'Oslash' => 216,
    'Ugrave' => 217, 'Uacute' => 218, 'Ucirc' => 219, 'Uuml' => 220,
    'Yacute' => 221, 'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
    'aacute' => 225, 'acirc' => 226, 'atilde' => 227, 'auml' => 228,
    'aring' => 229, 'aelig' => 230, 'ccedil' => 231, 'egrave' => 232,
    'eacute' => 233, 'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
    'iacute' => 237, 'icirc' => 238, 'iuml' => 239, 'eth' => 240,
    'ntilde' => 241, 'ograve' => 242, 'oacute' => 243, 'ocirc' => 244,
    'otilde' => 245, 'ouml' => 246, 'divide' => 247, 'oslash' => 248,
    'ugrave' => 249, 'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
    'yacute' => 253, 'thorn' => 254, 'yuml' => 255, 'quot' => 34,
    'amp' => 38, 'lt' => 60, 'gt' => 62, 'apos' => 39, 'OElig' => 338,
    'oelig' => 339, 'Scaron' => 352, 'scaron' => 353, 'Yuml' => 376,
    'circ' => 710, 'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
    'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205, 'lrm' => 8206,
    'rlm' => 8207, 'ndash' => 8211, 'mdash' => 8212, 'lsquo' => 8216,
    'rsquo' => 8217, 'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
    'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225, 'permil' => 8240,
    'lsaquo' => 8249, 'rsaquo' => 8250, 'euro' => 8364, 'fnof' => 402,
    'Alpha' => 913, 'Beta' => 914, 'Gamma' => 915, 'Delta' => 916,
    'Epsilon' => 917, 'Zeta' => 918, 'Eta' => 919, 'Theta' => 920,
    'Iota' => 921, 'Kappa' => 922, 'Lambda' => 923, 'Mu' => 924, 'Nu' => 925,
    'Xi' => 926, 'Omicron' => 927, 'Pi' => 928, 'Rho' => 929, 'Sigma' => 931,
    'Tau' => 932, 'Upsilon' => 933, 'Phi' => 934, 'Chi' => 935, 'Psi' => 936,
    'Omega' => 937, 'alpha' => 945, 'beta' => 946, 'gamma' => 947,
    'delta' => 948, 'epsilon' => 949, 'zeta' => 950, 'eta' => 951,
    'theta' => 952, 'iota' => 953, 'kappa' => 954, 'lambda' => 955,
    'mu' => 956, 'nu' => 957, 'xi' => 958, 'omicron' => 959, 'pi' => 960,
    'rho' => 961, 'sigmaf' => 962, 'sigma' => 963, 'tau' => 964,
    'upsilon' => 965, 'phi' => 966, 'chi' => 967, 'psi' => 968,
    'omega' => 969, 'thetasym' => 977, 'upsih' => 978, 'piv' => 982,
    'bull' => 8226, 'hellip' => 8230, 'prime' => 8242, 'Prime' => 8243,
    'oline' => 8254, 'frasl' => 8260, 'weierp' => 8472, 'image' => 8465,
    'real' => 8476, 'trade' => 8482, 'alefsym' => 8501, 'larr' => 8592,
    'uarr' => 8593, 'rarr' => 8594, 'darr' => 8595, 'harr' => 8596,
    'crarr' => 8629, 'lArr' => 8656, 'uArr' => 8657, 'rArr' => 8658,
    'dArr' => 8659, 'hArr' => 8660, 'forall' => 8704, 'part' => 8706,
    'exist' => 8707, 'empty' => 8709, 'nabla' => 8711, 'isin' => 8712,
    'notin' => 8713, 'ni' => 8715, 'prod' => 8719, 'sum' => 8721,
    'minus' => 8722, 'lowast' => 8727, 'radic' => 8730, 'prop' => 8733,
    'infin' => 8734, 'ang' => 8736, 'and' => 8743, 'or' => 8744, 'cap' => 8745,
    'cup' => 8746, 'int' => 8747, 'there4' => 8756, 'sim' => 8764,
    'cong' => 8773, 'asymp' => 8776, 'ne' => 8800, 'equiv' => 8801,
    'le' => 8804, 'ge' => 8805, 'sub' => 8834, 'sup' => 8835, 'nsub' => 8836,
    'sube' => 8838, 'supe' => 8839, 'oplus' => 8853, 'otimes' => 8855,
    'perp' => 8869, 'sdot' => 8901, 'lceil' => 8968, 'rceil' => 8969,
    'lfloor' => 8970, 'rfloor' => 8971, 'lang' => 9001, 'rang' => 9002,
    'loz' => 9674, 'spades' => 9824, 'clubs' => 9827, 'hearts' => 9829,
    'diams' => 9830
  );
}