AbstractLexer.php 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. <?php
  2. declare(strict_types=1);
  3. namespace Doctrine\Common\Lexer;
  4. use ReflectionClass;
  5. use function implode;
  6. use function in_array;
  7. use function preg_split;
  8. use function sprintf;
  9. use function substr;
  10. use const PREG_SPLIT_DELIM_CAPTURE;
  11. use const PREG_SPLIT_NO_EMPTY;
  12. use const PREG_SPLIT_OFFSET_CAPTURE;
  /**
   * Base class for writing simple lexers, i.e. for creating small DSLs.
   *
   * A concrete lexer supplies the regular-expression fragments to split on
   * (getCatchablePatterns() / getNonCatchablePatterns()) and a getType()
   * implementation that classifies every matched fragment. setInput() tokenizes
   * the whole input eagerly; the token list is then walked with moveNext(),
   * peek() and glimpse().
   *
   * NOTE: the methods deliberately carry no native parameter/return types here.
   * PHP rejects an overriding method that omits a return type declared by its
   * parent, so adding them would break existing subclasses.
   */
  abstract class AbstractLexer
  {
      /**
       * Lexer original input string.
       *
       * Starts out as an empty string so that getInputUntilPosition() can be
       * called safely before any input has been set.
       *
       * @var string
       */
      private $input = '';

      /**
       * Array of scanned tokens.
       *
       * Each token is an associative array containing three items:
       *  - 'value'    : the string value of the token in the input string
       *  - 'type'     : the type of the token, as returned by getType()
       *  - 'position' : the offset of the token in the input string, as
       *                 reported by preg_split()
       *
       * @var mixed[][]
       * @psalm-var list<array{value: string, type: string|int|null, position: int}>
       */
      private $tokens = [];

      /**
       * Index into the token list of the next token moveNext() will load.
       *
       * @var int
       */
      private $position = 0;

      /**
       * Number of tokens already handed out by peek() since the last reset,
       * counted relative to the current position.
       *
       * @var int
       */
      private $peek = 0;

      /**
       * The next token in the input.
       *
       * @var mixed[]|null
       * @psalm-var array{value: string, type: string|int|null, position: int}|null
       */
      public $lookahead;

      /**
       * The last matched/seen token.
       *
       * @var mixed[]|null
       * @psalm-var array{value: string, type: string|int|null, position: int}|null
       */
      public $token;

      /**
       * Composed regex for input parsing, built lazily on the first scan().
       *
       * @var string|null
       */
      private $regex;

      /**
       * Sets the input data to be tokenized.
       *
       * The lexer is immediately reset and the new input tokenized.
       * Any unprocessed tokens from any previous input are lost.
       *
       * @param string $input The input to be tokenized.
       *
       * @return void
       */
      public function setInput($input)
      {
          $this->input  = $input;
          $this->tokens = [];

          $this->reset();
          $this->scan($input);
      }

      /**
       * Resets the lexer.
       *
       * Clears the lookahead and current token and rewinds both the position
       * and the peek offset; the scanned token list itself is kept.
       *
       * @return void
       */
      public function reset()
      {
          $this->lookahead = null;
          $this->token     = null;
          $this->peek      = 0;
          $this->position  = 0;
      }

      /**
       * Resets the peek pointer to 0.
       *
       * @return void
       */
      public function resetPeek()
      {
          $this->peek = 0;
      }

      /**
       * Resets the lexer position to the given index in the token list.
       *
       * Only the position is changed; lookahead and token keep their current
       * values until the next moveNext().
       *
       * @param int $position Position to place the lexical scanner.
       *
       * @return void
       */
      public function resetPosition($position = 0)
      {
          $this->position = $position;
      }

      /**
       * Retrieves the original lexer's input until a given position.
       *
       * Returns an empty string when no input has been set yet.
       *
       * @param int $position Offset (exclusive) up to which the input is returned.
       *
       * @return string
       */
      public function getInputUntilPosition($position)
      {
          // Byte-wise substr() is intentional: token positions recorded by
          // scan() are preg_split() offsets, which PHP reports in bytes.
          return substr($this->input, 0, $position);
      }

      /**
       * Checks whether the current lookahead has the given token type.
       *
       * @param int|string $token Token type to compare against.
       *
       * @return bool
       */
      public function isNextToken($token)
      {
          return $this->lookahead !== null && $this->lookahead['type'] === $token;
      }

      /**
       * Checks whether the current lookahead has any of the given token types.
       *
       * The comparison is strict, so the entries must have the same PHP type as
       * the values returned by getType().
       *
       * @param (int|string)[] $tokens Token types to compare against.
       *
       * @return bool
       */
      public function isNextTokenAny(array $tokens)
      {
          return $this->lookahead !== null && in_array($this->lookahead['type'], $tokens, true);
      }

      /**
       * Moves to the next token in the input string.
       *
       * The previous lookahead becomes the current token and the peek offset is
       * reset.
       *
       * @return bool TRUE if a new lookahead token is available, FALSE once the
       *              end of the token list has been reached.
       */
      public function moveNext()
      {
          $this->peek      = 0;
          $this->token     = $this->lookahead;
          $this->lookahead = isset($this->tokens[$this->position])
              ? $this->tokens[$this->position++]
              : null;

          return $this->lookahead !== null;
      }

      /**
       * Tells the lexer to skip input tokens until the lookahead is a token of
       * the given type, or until the input is exhausted.
       *
       * @param int|string $type The token type to skip until.
       *
       * @return void
       */
      public function skipUntil($type)
      {
          while ($this->lookahead !== null && $this->lookahead['type'] !== $type) {
              $this->moveNext();
          }
      }

      /**
       * Checks whether getType() classifies the given value as the given token
       * type.
       *
       * @param mixed      $value Value to classify.
       * @param int|string $token Expected token type.
       *
       * @return bool
       */
      public function isA($value, $token)
      {
          // $value is a by-value parameter, so any rewrite getType() performs
          // through its reference argument stays local to this call.
          return $this->getType($value) === $token;
      }

      /**
       * Moves the lookahead token forward.
       *
       * Every successful call advances the peek offset by one, so consecutive
       * calls walk further ahead of the current position.
       *
       * @return mixed[]|null The next token or NULL if there are no more tokens ahead.
       * @psalm-return array{value: string, type: string|int|null, position: int}|null
       */
      public function peek()
      {
          if (isset($this->tokens[$this->position + $this->peek])) {
              return $this->tokens[$this->position + $this->peek++];
          }

          return null;
      }

      /**
       * Peeks at the next token, returns it and immediately resets the peek.
       *
       * @return mixed[]|null The next token or NULL if there are no more tokens ahead.
       * @psalm-return array{value: string, type: string|int|null, position: int}|null
       */
      public function glimpse()
      {
          $peek       = $this->peek();
          $this->peek = 0;

          return $peek;
      }

      /**
       * Scans the input string for tokens and appends them to the token list.
       *
       * @param string $input A query string.
       *
       * @return void
       */
      protected function scan($input)
      {
          if (! isset($this->regex)) {
              // Catchable patterns each get their own capture group so that
              // PREG_SPLIT_DELIM_CAPTURE returns them as tokens; non-catchable
              // patterns only act as delimiters and are dropped.
              $this->regex = sprintf(
                  '/(%s)|%s/%s',
                  implode(')|(', $this->getCatchablePatterns()),
                  implode('|', $this->getNonCatchablePatterns()),
                  $this->getModifiers()
              );
          }

          $flags   = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE;
          $matches = preg_split($this->regex, $input, -1, $flags);

          if ($matches === false) {
              // Work around https://bugs.php.net/78122
              $matches = [[$input, 0]];
          }

          foreach ($matches as $match) {
              // Must remain before 'value' assignment since it can change content
              $type = $this->getType($match[0]);

              $this->tokens[] = [
                  'value'    => $match[0],
                  'type'     => $type,
                  'position' => $match[1],
              ];
          }
      }

      /**
       * Gets the literal for a given token.
       *
       * Looks for a constant of the concrete lexer class whose value is
       * identical to the token and returns its qualified name
       * ("ClassName::CONSTANT"); the token itself is returned when no constant
       * matches.
       *
       * @param int|string $token
       *
       * @return int|string
       */
      public function getLiteral($token)
      {
          $className = static::class;
          $reflClass = new ReflectionClass($className);
          $constants = $reflClass->getConstants();

          foreach ($constants as $name => $value) {
              if ($value === $token) {
                  return $className . '::' . $name;
              }
          }

          return $token;
      }

      /**
       * Regex modifiers appended to the composed pattern.
       *
       * @return string
       */
      protected function getModifiers()
      {
          return 'iu';
      }

      /**
       * Lexical catchable patterns.
       *
       * Each entry is wrapped in its own capture group by scan(), so text
       * matching it is kept as a token.
       *
       * @return string[]
       */
      abstract protected function getCatchablePatterns();

      /**
       * Lexical non-catchable patterns.
       *
       * Text matching these patterns separates tokens but is not captured.
       *
       * @return string[]
       */
      abstract protected function getNonCatchablePatterns();

      /**
       * Retrieve token type. Also processes the token value if necessary.
       *
       * The value is passed by reference: whatever the implementation writes
       * into it is what scan() stores as the token's 'value'.
       *
       * @param string $value
       *
       * @return int|string|null
       */
      abstract protected function getType(&$value);
  }