diff options
Diffstat (limited to 'data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js')
-rw-r--r-- | data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js | 256 |
1 files changed, 256 insertions, 0 deletions
diff --git a/data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js b/data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js new file mode 100644 index 0000000..0a88504 --- /dev/null +++ b/data/extensions/uBlock0@raymondhill.net/js/regex-analyzer.js @@ -0,0 +1,256 @@ +/******************************************************************************* + + uBlock Origin - a comprehensive, efficient content blocker + Copyright (C) 2020-present Raymond Hill + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see {http://www.gnu.org/licenses/}. + + Home: https://github.com/gorhill/uBlock +*/ + +import Regex from '../lib/regexanalyzer/regex.js'; + +/******************************************************************************/ + +// Depends on: +// https://github.com/foo123/RegexAnalyzer +const RegexAnalyzer = Regex && Regex.Analyzer || null; + +export function isRE2(reStr) { + if ( RegexAnalyzer === null ) { return true; } + try { + return _isRE2(RegexAnalyzer(reStr, false).tree()); + } catch { + } + return false; +} + +export function tokenizableStrFromRegex(reStr) { + return _literalStrFromRegex(reStr); +} + +/******************************************************************************/ + +function _isRE2(node) { + if ( node instanceof Object === false ) { return true; } + if ( node.flags instanceof Object ) { + if ( node.flags.LookAhead === 1 ) { return false; } + if ( node.flags.NegativeLookAhead === 1 ) { return false; } + if ( node.flags.LookBehind === 1 ) { return false; } + if ( node.flags.NegativeLookBehind === 1 ) { return false; } + } + if ( Array.isArray(node.val) ) { + for ( const entry of node.val ) { + if ( _isRE2(entry) === false ) { return false; } + } + } + if ( node.val instanceof Object ) { + return _isRE2(node.val); + } + return true; +} + +/******************************************************************************/ + +function _literalStrFromRegex(reStr) { + if ( RegexAnalyzer === null ) { return ''; } + let s = ''; + try { + s = tokenizableStrFromNode( + RegexAnalyzer(reStr, false).tree() + ); + } catch { + } + // Process optional sequences + const reOptional = /[\x02\x03]+/; + for (;;) { + const match = reOptional.exec(s); + if ( match === null ) { break; } + const left = s.slice(0, match.index); + const middle = match[0]; + const right = s.slice(match.index + middle.length); + s = left; + s += firstCharCodeClass(right) === 1 || firstCharCodeClass(middle) === 1 + ? '\x01' + : '\x00'; + s += lastCharCodeClass(left) === 1 || lastCharCodeClass(middle) === 1 + ? '\x01' + : '\x00'; + s += right; + } + return s; +} + +function firstCharCodeClass(s) { + if ( s.length === 0 ) { return 0; } + const c = s.charCodeAt(0); + if ( c === 1 || c === 3 ) { return 1; } + return reCharCodeClass.test(s.charAt(0)) ? 1 : 0; +} + +function lastCharCodeClass(s) { + const i = s.length - 1; + if ( i === -1 ) { return 0; } + const c = s.charCodeAt(i); + if ( c === 1 || c === 3 ) { return 1; } + return reCharCodeClass.test(s.charAt(i)) ? 1 : 0; +} + +const reCharCodeClass = /[%0-9A-Za-z]/; + +function tokenizableStrFromNode(node) { + switch ( node.type ) { + case 1: /* T_SEQUENCE, 'Sequence' */ { + let s = ''; + for ( let i = 0; i < node.val.length; i++ ) { + s += tokenizableStrFromNode(node.val[i]); + } + return s; + } + case 2: /* T_ALTERNATION, 'Alternation' */ + case 8: /* T_CHARGROUP, 'CharacterGroup' */ { + if ( node.flags.NegativeMatch ) { return '\x01'; } + let firstChar = 0; + let lastChar = 0; + for ( let i = 0; i < node.val.length; i++ ) { + const s = tokenizableStrFromNode(node.val[i]); + if ( firstChar === 0 && firstCharCodeClass(s) === 1 ) { + firstChar = 1; + } + if ( lastChar === 0 && lastCharCodeClass(s) === 1 ) { + lastChar = 1; + } + if ( firstChar === 1 && lastChar === 1 ) { break; } + } + return String.fromCharCode(firstChar, lastChar); + } + case 4: /* T_GROUP, 'Group' */ { + if ( + node.flags.NegativeLookAhead === 1 || + node.flags.NegativeLookBehind === 1 + ) { + return ''; + } + return tokenizableStrFromNode(node.val); + } + case 16: /* T_QUANTIFIER, 'Quantifier' */ { + if ( node.flags.max === 0 ) { return ''; } + const s = tokenizableStrFromNode(node.val); + const first = firstCharCodeClass(s); + const last = lastCharCodeClass(s); + if ( node.flags.min !== 0 ) { + return String.fromCharCode(first, last); + } + return String.fromCharCode(first+2, last+2); + } + case 64: /* T_HEXCHAR, 'HexChar' */ { + if ( + node.flags.Code === '01' || + node.flags.Code === '02' || + node.flags.Code === '03' + ) { + return '\x00'; + } + return node.flags.Char; + } + case 128: /* T_SPECIAL, 'Special' */ { + const flags = node.flags; + if ( + flags.EndCharGroup === 1 || // dangling `]` + flags.EndGroup === 1 || // dangling `)` + flags.EndRepeats === 1 // dangling `}` + ) { + throw new Error('Unmatched bracket'); + } + return flags.MatchEnd === 1 || + flags.MatchStart === 1 || + flags.MatchWordBoundary === 1 + ? '\x00' + : '\x01'; + } + case 256: /* T_CHARS, 'Characters' */ { + for ( let i = 0; i < node.val.length; i++ ) { + if ( firstCharCodeClass(node.val[i]) === 1 ) { + return '\x01'; + } + } + return '\x00'; + } + // Ranges are assumed to always involve token-related characters. + case 512: /* T_CHARRANGE, 'CharacterRange' */ { + return '\x01'; + } + case 1024: /* T_STRING, 'String' */ { + return node.val; + } + case 2048: /* T_COMMENT, 'Comment' */ { + return ''; + } + default: + break; + } + return '\x01'; +} + +/******************************************************************************/ + +export function toHeaderPattern(reStr) { + if ( RegexAnalyzer === null ) { return; } + try { + return _toHeaderPattern(RegexAnalyzer(reStr, false).tree()); + } catch { + } +} + +function _toHeaderPattern(branch, depth = 0) { + switch ( branch.type ) { + case 1: /* T_SEQUENCE, 'Sequence' */ { + let s = ''; + for ( const node of branch.val ) { + const t = _toHeaderPattern(node, depth+1); + if ( t === undefined ) { return; } + s += t; + } + if ( depth === 0 && branch.val.length !== 0 ) { + const first = branch.val[0]; + if ( first.type !== 128 || first.val !== '^' ) { s = `*${s}`; } + const last = branch.val.at(-1); + if ( last.type !== 128 || last.val !== '$' ) { s = `${s}*`; } + } + return s; + } + case 4: /* T_GROUP, 'Group' */ { + if ( + branch.flags.NegativeLookAhead === 1 || + branch.flags.NegativeLookBehind === 1 + ) { + return; + } + return _toHeaderPattern(branch.val, depth+1); + } + case 64: /* T_HEXCHAR, 'HexChar' */ + return branch.flags.Char; + case 128: /* T_SPECIAL, 'Special' */ { + if ( branch.val === '^' ) { return ''; } + if ( branch.val === '$' ) { return ''; } + return; + } + case 1024: /* T_STRING, 'String' */ + return branch.val; + case 2048: /* T_COMMENT, 'Comment' */ + return ''; + default: + break; + } +} |