summary refs log tree commit diff
path: root/data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js
diff options
context:
space:
mode:
Diffstat (limited to 'data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js')
-rw-r--r-- data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js 256
1 files changed, 256 insertions, 0 deletions
diff --git a/data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js b/data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js
new file mode 100644
index 0000000..0a88504
--- /dev/null
+++ b/data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js
@@ -0,0 +1,256 @@
+/*******************************************************************************
+
+ uBlock Origin - a comprehensive, efficient content blocker
+ Copyright (C) 2020-present Raymond Hill
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see {http://www.gnu.org/licenses/}.
+
+ Home: https://github.com/gorhill/uBlock
+*/
+
+import Regex from '../lib/regexanalyzer/regex.js';
+
+/******************************************************************************/
+
+// External dependency:
+//   https://github.com/foo123/RegexAnalyzer
+// Resolves to `null` whenever the imported module, or its `Analyzer` member,
+// is falsy; every consumer below checks for that `null` before using it.
+const RegexAnalyzer = Regex?.Analyzer || null;
+
+// Reports whether the regex source `reStr` passes the `_isRE2` tree check,
+// i.e. its parsed tree carries no lookahead/lookbehind flags (presumably
+// meaning the pattern is usable with an RE2 engine -- confirm with callers).
+//  - Returns `true` when RegexAnalyzer is unavailable (nothing is checked).
+//  - Returns `false` when parsing or walking the tree throws.
+export function isRE2(reStr) {
+    if ( RegexAnalyzer === null ) { return true; }
+    let verdict = false;
+    try {
+        const tree = RegexAnalyzer(reStr, false).tree();
+        verdict = _isRE2(tree);
+    } catch {
+        // Any analyzer failure leaves the verdict at `false`.
+    }
+    return verdict;
+}
+
+// Exported entry point: hands the regex source `reStr` to
+// `_literalStrFromRegex` and returns its result unchanged.
+export function tokenizableStrFromRegex(reStr) {
+    const tokenizable = _literalStrFromRegex(reStr);
+    return tokenizable;
+}
+
+/******************************************************************************/
+
+// Recursively inspects a parsed-regex tree node. Returns `false` as soon as
+// a node whose `flags` mark a lookahead or lookbehind (positive or negative)
+// is found, `true` otherwise. Anything that is not an object is accepted.
+function _isRE2(node) {
+    if ( (node instanceof Object) === false ) { return true; }
+    const { flags, val } = node;
+    if ( flags instanceof Object ) {
+        const hasLookaround =
+            flags.LookAhead === 1 ||
+            flags.NegativeLookAhead === 1 ||
+            flags.LookBehind === 1 ||
+            flags.NegativeLookBehind === 1;
+        if ( hasLookaround ) { return false; }
+    }
+    if ( Array.isArray(val) && val.some(entry => _isRE2(entry) === false) ) {
+        return false;
+    }
+    // An array is also an `Object`, so an array `val` is visited once more
+    // here; a plain array has neither `flags` nor `val`, so that call
+    // simply yields `true`.
+    return val instanceof Object ? _isRE2(val) : true;
+}
+
+/******************************************************************************/
+
+// Builds the "tokenizable" string for the regex source `reStr`.
+//
+// The tree produced by RegexAnalyzer is flattened by
+// `tokenizableStrFromNode`; the result mixes literal text with marker
+// characters \x00..\x03. Each run of \x02/\x03 markers (emitted for
+// quantifiers whose `min` is 0) is then replaced by exactly two \x00/\x01
+// markers derived from the run itself and from the text around it.
+//
+// Returns '' when RegexAnalyzer is unavailable. If parsing/flattening
+// throws, the string stays empty.
+function _literalStrFromRegex(reStr) {
+    if ( RegexAnalyzer === null ) { return ''; }
+    let out = '';
+    try {
+        const tree = RegexAnalyzer(reStr, false).tree();
+        out = tokenizableStrFromNode(tree);
+    } catch {
+    }
+    // Process optional sequences
+    const reOptionalRun = /[\x02\x03]+/;
+    let found = reOptionalRun.exec(out);
+    while ( found !== null ) {
+        const run = found[0];
+        const before = out.slice(0, found.index);
+        const after = out.slice(found.index + run.length);
+        // First marker: class of what follows the run, or of the run's start.
+        const leadIsClass1 =
+            firstCharCodeClass(after) === 1 ||
+            firstCharCodeClass(run) === 1;
+        // Second marker: class of what precedes the run, or of the run's end.
+        const trailIsClass1 =
+            lastCharCodeClass(before) === 1 ||
+            lastCharCodeClass(run) === 1;
+        out = before;
+        out += leadIsClass1 ? '\x01' : '\x00';
+        out += trailIsClass1 ? '\x01' : '\x00';
+        out += after;
+        // The regex is not global, so each search restarts from index 0.
+        found = reOptionalRun.exec(out);
+    }
+    return out;
+}
+
+// Classifies the first character of `s`: returns 1 when its char code is 1
+// or 3 (marker characters) or when it matches `reCharCodeClass`; returns 0
+// otherwise, including for an empty string.
+function firstCharCodeClass(s) {
+    if ( s.length === 0 ) { return 0; }
+    switch ( s.charCodeAt(0) ) {
+    case 1:
+    case 3:
+        return 1;
+    default:
+        return reCharCodeClass.test(s.charAt(0)) ? 1 : 0;
+    }
+}
+
+// Classifies the last character of `s`: returns 1 when its char code is 1
+// or 3 (marker characters) or when it matches `reCharCodeClass`; returns 0
+// otherwise, including for an empty string.
+function lastCharCodeClass(s) {
+    const lastIndex = s.length - 1;
+    if ( lastIndex === -1 ) { return 0; }
+    switch ( s.charCodeAt(lastIndex) ) {
+    case 1:
+    case 3:
+        return 1;
+    default:
+        return reCharCodeClass.test(s.charAt(lastIndex)) ? 1 : 0;
+    }
+}
+
+// Characters reported as class 1 by firstCharCodeClass()/lastCharCodeClass():
+// `%`, ASCII digits and ASCII letters. Not a global/sticky regex, so
+// repeated `.test()` calls carry no state.
+const reCharCodeClass = /[%0-9A-Za-z]/;
+
+// Flattens a RegexAnalyzer tree node into a string where literal text is
+// kept as-is and every other construct is reduced to marker characters:
+//   \x00 / \x01 : class 0 / class 1 (as computed by firstCharCodeClass and
+//                 lastCharCodeClass)
+//   \x02 / \x03 : the same two classes, emitted for quantifiers whose
+//                 `min` is 0
+// Two-character results encode (class of first char, class of last char).
+// Node types not handled explicitly yield '\x01'.
+// Throws an Error on a dangling `]`, `)` or `}`.
+function tokenizableStrFromNode(node) {
+    switch ( node.type ) {
+    case 1: /* T_SEQUENCE, 'Sequence' */ {
+        // Concatenate the flattened form of every child, in order.
+        let joined = '';
+        for ( const child of node.val ) {
+            joined += tokenizableStrFromNode(child);
+        }
+        return joined;
+    }
+    case 2: /* T_ALTERNATION, 'Alternation' */
+    case 8: /* T_CHARGROUP, 'CharacterGroup' */ {
+        if ( node.flags.NegativeMatch ) { return '\x01'; }
+        // Class 1 wins as soon as any branch starts (resp. ends) with it.
+        let leadClass = 0;
+        let trailClass = 0;
+        for ( const child of node.val ) {
+            const part = tokenizableStrFromNode(child);
+            if ( leadClass === 0 ) {
+                leadClass = firstCharCodeClass(part);
+            }
+            if ( trailClass === 0 ) {
+                trailClass = lastCharCodeClass(part);
+            }
+            if ( leadClass === 1 && trailClass === 1 ) { break; }
+        }
+        return String.fromCharCode(leadClass, trailClass);
+    }
+    case 4: /* T_GROUP, 'Group' */ {
+        // Negative lookarounds contribute nothing to the result.
+        const isNegativeLookaround =
+            node.flags.NegativeLookAhead === 1 ||
+            node.flags.NegativeLookBehind === 1;
+        return isNegativeLookaround ? '' : tokenizableStrFromNode(node.val);
+    }
+    case 16: /* T_QUANTIFIER, 'Quantifier' */ {
+        if ( node.flags.max === 0 ) { return ''; }
+        const inner = tokenizableStrFromNode(node.val);
+        const leadClass = firstCharCodeClass(inner);
+        const trailClass = lastCharCodeClass(inner);
+        // `min` of 0 shifts the markers from \x00/\x01 to \x02/\x03.
+        const shift = node.flags.min !== 0 ? 0 : 2;
+        return String.fromCharCode(leadClass + shift, trailClass + shift);
+    }
+    case 64: /* T_HEXCHAR, 'HexChar' */ {
+        // Codes 01..03 would collide with the marker characters.
+        const collidesWithMarker = [ '01', '02', '03' ].includes(node.flags.Code);
+        return collidesWithMarker ? '\x00' : node.flags.Char;
+    }
+    case 128: /* T_SPECIAL, 'Special' */ {
+        const special = node.flags;
+        const isDangling =
+            special.EndCharGroup === 1 || // dangling `]`
+            special.EndGroup === 1 ||     // dangling `)`
+            special.EndRepeats === 1;     // dangling `}`
+        if ( isDangling ) {
+            throw new Error('Unmatched bracket');
+        }
+        const isAnchor =
+            special.MatchEnd === 1 ||
+            special.MatchStart === 1 ||
+            special.MatchWordBoundary === 1;
+        return isAnchor ? '\x00' : '\x01';
+    }
+    case 256: /* T_CHARS, 'Characters' */ {
+        for ( const entry of node.val ) {
+            if ( firstCharCodeClass(entry) === 1 ) { return '\x01'; }
+        }
+        return '\x00';
+    }
+    // Ranges are assumed to always involve token-related characters.
+    case 512: /* T_CHARRANGE, 'CharacterRange' */
+        return '\x01';
+    case 1024: /* T_STRING, 'String' */
+        return node.val;
+    case 2048: /* T_COMMENT, 'Comment' */
+        return '';
+    default:
+        return '\x01';
+    }
+}
+
+/******************************************************************************/
+
+// Converts the regex source `reStr` into the pattern string built by
+// `_toHeaderPattern`. Returns `undefined` when RegexAnalyzer is
+// unavailable, when parsing throws, or when the tree cannot be converted.
+export function toHeaderPattern(reStr) {
+    if ( RegexAnalyzer === null ) { return; }
+    let pattern;
+    try {
+        const tree = RegexAnalyzer(reStr, false).tree();
+        pattern = _toHeaderPattern(tree);
+    } catch {
+        // Leave `pattern` undefined on any analyzer failure.
+    }
+    return pattern;
+}
+
+// Recursively converts a parsed-regex tree into a literal pattern string.
+//
+// Only sequences, groups (other than negative lookarounds), hex chars, the
+// `^`/`$` specials, plain strings and comments are convertible; any other
+// node makes the whole conversion return `undefined`.
+//
+// For the top-level sequence (`depth` 0, non-empty), a `*` is prepended
+// unless the first node is `^`, and a `*` is appended unless the last node
+// is `$`.
+function _toHeaderPattern(branch, depth = 0) {
+    const { type, val } = branch;
+    if ( type === 1 /* T_SEQUENCE, 'Sequence' */ ) {
+        let pattern = '';
+        for ( const child of val ) {
+            const piece = _toHeaderPattern(child, depth + 1);
+            if ( piece === undefined ) { return; }
+            pattern += piece;
+        }
+        if ( depth !== 0 || val.length === 0 ) { return pattern; }
+        const head = val[0];
+        const tail = val.at(-1);
+        const anchoredAtStart = head.type === 128 && head.val === '^';
+        const anchoredAtEnd = tail.type === 128 && tail.val === '$';
+        if ( anchoredAtStart === false ) { pattern = `*${pattern}`; }
+        if ( anchoredAtEnd === false ) { pattern = `${pattern}*`; }
+        return pattern;
+    }
+    if ( type === 4 /* T_GROUP, 'Group' */ ) {
+        const isNegativeLookaround =
+            branch.flags.NegativeLookAhead === 1 ||
+            branch.flags.NegativeLookBehind === 1;
+        if ( isNegativeLookaround ) { return; }
+        return _toHeaderPattern(val, depth + 1);
+    }
+    if ( type === 64 /* T_HEXCHAR, 'HexChar' */ ) {
+        return branch.flags.Char;
+    }
+    if ( type === 128 /* T_SPECIAL, 'Special' */ ) {
+        // Anchors add no text; every other special is unsupported.
+        return val === '^' || val === '$' ? '' : undefined;
+    }
+    if ( type === 1024 /* T_STRING, 'String' */ ) {
+        return val;
+    }
+    if ( type === 2048 /* T_COMMENT, 'Comment' */ ) {
+        return '';
+    }
+    // Unsupported node type: implicit `undefined`.
+}