-
Notifications
You must be signed in to change notification settings - Fork 2k
Expand file tree
/
Copy pathMissingRegExpAnchor.qll
More file actions
234 lines (217 loc) · 8.34 KB
/
Copy pathMissingRegExpAnchor.qll
File metadata and controls
234 lines (217 loc) · 8.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/**
* Provides predicates for reasoning about regular expressions
* without anchors.
*/
overlay[local?]
module;
private import RegexTreeView
import HostnameRegexp as HostnameShared
/**
 * A signature listing the language-specific pieces that the analysis of
 * regular expressions without anchors needs in addition to the tree view.
 *
 * The analysis is layered on top of the hostname analysis, so an implementation of
 * `HostnameShared::HostnameRegexpSig` for the same `TreeImpl` must be passed as `Specific`.
 */
signature module MissingRegExpAnchorSig<
RegexTreeViewSig TreeImpl, HostnameShared::HostnameRegexpSig<TreeImpl> Specific>
{
/**
 * Holds if `pattern` is used in a 'replacement' operation, such as replacing
 * all matches of the regular expression in the input string with another string.
 *
 * The analysis uses this to suppress results for such patterns.
 */
predicate isUsedAsReplace(Specific::RegExpPatternSource pattern);
/**
 * Gets a string representation of an end anchor from a regular expression.
 *
 * The text is embedded in the alert message that reports a missing end anchor.
 */
string getEndAnchorText();
}
/**
* Classes and predicates implementing an analysis on regular expressions
* without anchors.
*/
module Make<
RegexTreeViewSig TreeImpl, HostnameShared::HostnameRegexpSig<TreeImpl> HostnameImpl,
MissingRegExpAnchorSig<TreeImpl, HostnameImpl> Impl>
{
private import TreeImpl
private import HostnameShared::Make<TreeImpl, HostnameImpl> as HostnameRegexp
private import HostnameImpl
private import Impl
/**
 * Holds if `term` is in final position, meaning that no other term of the
 * regular expression gets to match anything after `term` has matched.
 *
 * The root term is final. Finality is inherited by the last child of a final
 * sequence, and by every child of a final term that is neither a sequence
 * nor a quantifier.
 */
predicate isFinalRegExpTerm(RegExpTerm term) {
  // Base case: the root of the tree.
  term.isRootTerm()
  or
  // Children of a final non-sequence, non-quantifier term are final as well.
  exists(RegExpTerm enclosing |
    not (enclosing instanceof RegExpSequence or enclosing instanceof RegExpQuantifier) and
    enclosing.getAChild() = term and
    isFinalRegExpTerm(enclosing)
  )
  or
  // Inside a final sequence only the last element is final.
  exists(RegExpSequence finalSeq |
    finalSeq.getLastChild() = term and
    isFinalRegExpTerm(finalSeq)
  )
}
/**
 * Holds if the anchor `term` sits in the interior of its tree, that is, the
 * hostname analysis classifies it as neither a left-arm term nor a
 * right-arm term.
 */
predicate isInteriorAnchor(RegExpAnchor term) {
  not (HostnameRegexp::isLeftArmTerm(term) or HostnameRegexp::isRightArmTerm(term))
}
/**
 * Holds if `term` itself, or one of its descendants, is an interior anchor
 * (see `isInteriorAnchor`). An example is `(foo|bar$|baz)`.
 */
predicate containsInteriorAnchor(RegExpTerm term) {
  exists(RegExpAnchor anchor |
    anchor = term.getAChild*() and
    isInteriorAnchor(anchor)
  )
}
/**
 * Holds if the first element of the sequence `term` is a word boundary,
 * a non-word boundary, or a lookbehind assertion.
 *
 * Such a prefix indicates that the sequence is not meant to be anchored
 * on its left side.
 */
predicate containsLeadingPseudoAnchor(RegExpSequence term) {
  exists(RegExpTerm first |
    first = term.getChild(0) and
    (
      first instanceof RegExpLookbehind
      or
      first instanceof RegExpNonWordBoundary
      or
      first instanceof RegExpWordBoundary
    )
  )
}
/**
 * Holds if the last element of the sequence `term` is a word boundary,
 * a non-word boundary, or a lookahead assertion.
 *
 * Such a suffix indicates that the sequence is not meant to be anchored
 * on its right side.
 */
predicate containsTrailingPseudoAnchor(RegExpSequence term) {
  exists(RegExpTerm last |
    last = term.getLastChild() and
    (
      last instanceof RegExpLookahead
      or
      last instanceof RegExpNonWordBoundary
      or
      last instanceof RegExpWordBoundary
    )
  )
}
/**
 * Holds if the sequence `term` has no children at all.
 *
 * Empty sequences typically show up for a trailing alternative, as in `foo|`.
 */
predicate isEmpty(RegExpSequence term) {
  0 = term.getNumChild()
}
/**
 * Holds if `term` itself, or one of its descendants, is a constant whose
 * value contains at least one letter in `a-z` or `A-Z`.
 *
 * This serves as a heuristic for discarding uninteresting results.
 */
predicate containsLetters(RegExpTerm term) {
  exists(RegExpConstant constant |
    constant = term.getAChild*() and
    constant.getValue().regexpMatch(".*[a-zA-Z].*")
  )
}
/**
 * Holds if the sequence `term` has exactly two children, one of which is an
 * anchor and one of which is a group, such as the left side of `^(foo|bar)|baz`.
 *
 * In that shape the precedence of the anchor is most likely intentional,
 * because the parentheses would be redundant otherwise.
 */
predicate isAnchoredGroup(RegExpSequence term) {
  exists(RegExpAnchor anchor, RegExpGroup grp |
    anchor = term.getAChild() and
    grp = term.getAChild()
  ) and
  term.getNumChild() = 2
}
/**
 * Holds if one of the alternatives of `alt` is an anchored group, as in
 * `^(foo|bar)|baz`, while none of the alternatives is itself a bare group,
 * as would be the case in `^(foo)|(bar)`.
 */
predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
  // No alternative is merely a (superfluous) group ...
  not exists(RegExpGroup grp | grp = alt.getAChild()) and
  // ... and at least one alternative has the `anchor + group` shape.
  exists(RegExpSequence branch |
    branch = alt.getAChild() and
    isAnchoredGroup(branch)
  )
}
/**
 * Holds if the root of the pattern `src` is an alternation in which only the
 * first alternative is anchored at the beginning, or only the last alternative
 * is anchored at the end. This suggests an operator-precedence mistake,
 * described by `msg`.
 *
 * The textbook case is `^a|b|c`, which parses as `(^a)|(b)|(c)`.
 *
 * Patterns used for replacement, patterns with interior anchors, patterns
 * with an empty alternative, and patterns whose anchor precedence looks
 * deliberate are not reported.
 */
predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
  exists(RegExpAlt alt, RegExpSequence anchored, string side |
    alt = src.getRegExpTerm() and
    // Patterns used in replace operations are out of scope.
    not isUsedAsReplace(src) and
    // Filters that remove deliberate or uninteresting patterns.
    containsLetters(anchored) and
    not hasExplicitAnchorPrecedence(alt) and
    not isEmpty(alt.getAChild()) and
    not containsInteriorAnchor(alt) and
    (
      // Only the first alternative starts with a caret: some other alternative
      // contains letters, and none of them starts with a pseudo-anchor.
      side = "beginning" and
      anchored = alt.getChild(0) and
      anchored.getChild(0) instanceof RegExpCaret and
      containsLetters(alt.getChild([1 .. alt.getNumChild() - 1])) and
      not containsLeadingPseudoAnchor(alt.getChild([1 .. alt.getNumChild() - 1]))
      or
      // Only the last alternative ends with a dollar: some other alternative
      // contains letters, and none of them ends with a pseudo-anchor.
      side = "end" and
      anchored = alt.getLastChild() and
      anchored.getLastChild() instanceof RegExpDollar and
      containsLetters(alt.getChild([0 .. alt.getNumChild() - 2])) and
      not containsTrailingPseudoAnchor(alt.getChild([0 .. alt.getNumChild() - 2]))
    ) and
    msg =
      "Misleading operator precedence. The subexpression '" + anchored.getRawValue() +
        "' is anchored at the " + side +
        ", but the other parts of this regular expression are not"
  )
}
/**
 * Holds if `src` contains a hostname pattern that begins with a caret but is
 * missing an end anchor after its top-level domain. `msg` explains the problem.
 *
 * Patterns already flagged by `hasMisleadingAnchorPrecedence` are excluded.
 */
predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
  exists(RegExpTerm root, RegExpSequence seq, int idx |
    root = src.getRegExpTerm() and
    seq = root.getAChild*() and
    // The sequence is anchored on the left ...
    seq.getChild(0) instanceof RegExpCaret and
    // ... and ends in a top-level domain after which nothing more is matched.
    HostnameRegexp::hasTopLevelDomainEnding(seq, idx) and
    isFinalRegExpTerm(seq.getChild(idx)) and
    not HostnameRegexp::isConstantInvalidInsideOrigin(root.getAChild*()) and
    msg =
      "This hostname pattern may match any domain name, as it is missing a '" + getEndAnchorText()
        + "' or '/' at the end."
  ) and
  // Avoid reporting the same pattern twice.
  not hasMisleadingAnchorPrecedence(src, _)
}
/**
 * Holds if `src` is a URL pattern that contains no anchor at all, which is a
 * mistake explained by `msg`.
 *
 * The pattern must always match a hostname and contain a sequence ending in
 * a top-level domain. Patterns used for string replacement are not reported.
 */
predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
  exists(RegExpTerm root, RegExpSequence seq |
    root = src.getRegExpTerm() and
    seq = root.getAChild*() and
    HostnameRegexp::hasTopLevelDomainEnding(seq) and
    HostnameRegexp::alwaysMatchesHostname(root) and
    // There is no anchor anywhere in the pattern.
    not exists(RegExpAnchor anchor | anchor = root.getAChild*()) and
    not HostnameRegexp::isConstantInvalidInsideOrigin(root.getAChild*()) and
    // The pattern is not used for string replacement.
    not isUsedAsReplace(src) and
    msg =
      "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
  )
}
/**
 * Holds if `src` contains a hostname pattern that is anchored with the line
 * anchors `^` or `$` instead of `\A` or `\z`, which match the start and end
 * of the whole string. `msg` explains the problem.
 *
 * Patterns already flagged by `isSemiAnchoredHostnameRegExp` or
 * `hasMisleadingAnchorPrecedence` are excluded.
 */
predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
  // Avoid reporting the same pattern twice.
  not isSemiAnchoredHostnameRegExp(src, _) and
  not hasMisleadingAnchorPrecedence(src, _) and
  exists(RegExpTerm root, RegExpSequence seq, int idx |
    root = src.getRegExpTerm() and
    seq = root.getAChild*() and
    // The sequence ends in a top-level domain after which nothing more is matched.
    HostnameRegexp::hasTopLevelDomainEnding(seq, idx) and
    isFinalRegExpTerm(seq.getChild(idx)) and
    not HostnameRegexp::isConstantInvalidInsideOrigin(root.getAChild*()) and
    // At least one end of the sequence uses a line anchor.
    (
      seq.getLastChild().(RegExpDollar).getChar() = "$"
      or
      seq.getChild(0).(RegExpCaret).getChar() = "^"
    ) and
    msg =
      "This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead."
  )
}
}