forked from bigmlcom/python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpredicate.py
More file actions
215 lines (183 loc) · 7.57 KB
/
Copy pathpredicate.py
File metadata and controls
215 lines (183 loc) · 7.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# -*- coding: utf-8 -*-
#!/usr/bin/env python
#
# Copyright 2013-2015 BigML
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Predicate structure for the BigML local Model
This module defines an auxiliary Predicate structure that is used in the Tree
to save the node's predicate info.
"""
import operator
import re
from bigml.util import plural
# Lookup table from the operator symbol stored in a predicate to the
# function of the `operator` module that evaluates it. "!=" and "/=" are
# two spellings of the same inequality test.
OPERATOR = dict([
    ("<", operator.lt),
    ("<=", operator.le),
    ("=", operator.eq),
    ("!=", operator.ne),
    ("/=", operator.ne),
    (">=", operator.ge),
    (">", operator.gt),
    ("in", operator.contains),
])

# Values that the `token_mode` option of a field's term analysis can take
TM_TOKENS = 'tokens_only'
TM_FULL_TERM = 'full_terms_only'
TM_ALL = 'all'

# Matches strings that have a word boundary somewhere in their interior.
# When token_mode is 'all', a term matching this pattern is handled as a
# full term instead of a single token.
FULL_TERM_PATTERN = re.compile(r'^.+\b.+$', flags=re.U)
# Phrase templates used to word a term-count comparison in a textual rule.
# Each template is filled with (count, unit), e.g. (2, 'times').
# ">=" means "count or more", so it is worded "at least" (it used to read
# "at most", which stated the opposite bound).
RELATIONS = {
    '<=': 'no more than %s %s',
    '>=': '%s %s at least',
    '>': 'more than %s %s',
    '<': 'less than %s %s'
}
def term_matches(text, forms_list, options):
    """Counts the occurrences in text of the term described by forms_list

       The first element of forms_list is taken as the term itself. Which
       kind of matching is applied depends on the `token_mode` option
       (default: tokens only):
         - 'full_terms_only': the text is compared for equality with the
           first form.
         - 'all': a lone form that matches FULL_TERM_PATTERN is compared
           for equality as well.
         - any other case: the forms are counted as tokens inside the text.
       The `case_sensitive` option (default False) is handed to the
       matching function that ends up being used.
    """
    mode = options.get('token_mode', TM_TOKENS)
    case_sensitive = options.get('case_sensitive', False)
    head = forms_list[0]
    compare_as_full_term = mode == TM_FULL_TERM or (
        mode == TM_ALL and len(forms_list) == 1 and
        re.match(FULL_TERM_PATTERN, head))
    if compare_as_full_term:
        return full_term_match(text, head, case_sensitive)
    return term_matches_tokens(text, forms_list, case_sensitive)
def full_term_match(text, full_term, case_sensitive):
    """Returns 1 when the whole text equals full_term and 0 otherwise

       Unless case_sensitive is set, both strings are lowercased before
       being compared.
    """
    if case_sensitive:
        return 1 if text == full_term else 0
    return 1 if text.lower() == full_term.lower() else 0
def get_tokens_flags(case_sensitive):
    """Returns flags for regular expression matching depending on text analysis
       options

       The result always includes re.U; re.I is added when case_sensitive
       is false.
    """
    return re.U if case_sensitive else (re.I | re.U)


def term_matches_tokens(text, forms_list, case_sensitive):
    """Counts the number of occurrences of the words in forms_list in the text

       A form is counted when it is delimited on each side by a word
       boundary or an underscore. Delimiting underscores are consumed by
       the match, so two occurrences sharing a single underscore count
       only once. Returns 0 when forms_list is empty.
    """
    if not forms_list:
        # with no forms the pattern would be built from delimiters only
        return 0
    flags = get_tokens_flags(case_sensitive)
    # Forms are literal text: escape them so that characters such as "+"
    # or "." are not read as regular expression syntax. The template is a
    # plain unicode literal because the `ur` prefix is not valid syntax in
    # Python 3.
    expression = u'|'.join(
        u'(?:\\b|_)%s(?:\\b|_)' % re.escape(form) for form in forms_list)
    pattern = re.compile(expression, flags=flags)
    # groups are non-capturing, so findall yields one entry per match
    return len(pattern.findall(text))
class Predicate(object):
    """A predicate to be evaluated in a tree's node.

       Attributes:
         operator: comparison symbol (a key of OPERATOR), stored without
                   the trailing "*" marker
         missing: True when the original operator ended in "*", i.e. a
                  missing input value also satisfies the predicate
         field: key used to look the field up in `fields` and `input_data`
         value: value the input (or the term count) is compared against
         term: term whose occurrences are counted, or None when the
               predicate does not refer to a term
    """
    def __init__(self, operation, field, value, term=None):
        self.operator = operation
        self.missing = False
        # a trailing "*" is a flag, not part of the comparison symbol
        if self.operator.endswith("*"):
            self.operator = self.operator[:-1]
            self.missing = True
        self.field = field
        self.value = value
        self.term = term

    def is_full_term(self, fields):
        """Returns a boolean showing if a term is considered as a full_term

           Always False for predicates without a term. Otherwise the
           answer depends on the `token_mode` of the field's term analysis:
           True for 'full_terms_only' and, for 'all', True only when the
           term matches FULL_TERM_PATTERN.
        """
        if self.term is not None:
            options = fields[self.field]['term_analysis']
            token_mode = options.get('token_mode', TM_TOKENS)
            if token_mode == TM_FULL_TERM:
                return True
            if token_mode == TM_ALL:
                # turn the match object (or None) into a real boolean
                return re.match(FULL_TERM_PATTERN, self.term) is not None
        return False

    def to_rule(self, fields, label='name'):
        """ Builds rule string from a predicate

            `label` is the attribute of the field's description used to
            name the field in the rule.
        """
        name = fields[self.field][label]
        full_term = self.is_full_term(fields)
        relation_missing = u" or missing" if self.missing else u""
        if self.term is not None:
            relation_suffix = ''
            # "< 1" (or lower) and "<= 0" occurrences both mean that the
            # term is absent
            if ((self.operator == '<' and self.value <= 1) or
                    (self.operator == '<=' and self.value == 0)):
                relation_literal = (u'is not equal to' if full_term
                                    else u'does not contain')
            else:
                relation_literal = u'is equal to' if full_term else u'contains'
                if not full_term:
                    # "> 0" occurrences is a plain "contains": no count
                    # suffix is needed
                    if self.operator != '>' or self.value != 0:
                        relation_suffix = (RELATIONS[self.operator] %
                                           (self.value,
                                            plural('time', self.value)))
            return u"%s %s %s %s%s" % (name, relation_literal,
                                       self.term, relation_suffix,
                                       relation_missing)
        if self.value is None:
            return u"%s %s" % (name,
                               u"is None" if self.operator == '='
                               else u"is not None")
        return u"%s %s %s%s" % (name,
                                self.operator,
                                self.value,
                                relation_missing)

    def to_LISP_rule(self, fields):
        """ Builds rule string in LISP from a predicate
        """
        if self.term is not None:
            options = fields[self.field]['term_analysis']
            case_insensitive = not options.get('case_sensitive', False)
            case_insensitive = u'true' if case_insensitive else u'false'
            language = options.get('language')
            language = u"" if language is None else u" %s" % language
            return u"(%s (occurrences (f %s) %s %s%s) %s)" % (
                self.operator, self.field, self.term,
                case_insensitive, language, self.value)
        if self.value is None:
            # same "(missing? field)" form as the one used for the
            # "or missing" case below, wrapped in "(not ...)" when negated
            rule = u"(missing? %s)" % (self.field,)
            if self.operator != "=":
                rule = u"(not %s)" % (rule,)
            return rule
        rule = u"(%s (f %s) %s)" % (self.operator,
                                    self.field,
                                    self.value)
        if self.missing:
            rule = u"(or (missing? %s) %s)" % (self.field, rule)
        return rule

    def apply(self, input_data, fields):
        """ Applies the operators defined in the predicate as strings to
            the provided input data

            Returns the result of the comparison. When the input has no
            value for the field, the result is True only if the predicate
            accepts missing values or is an "= None" test.
        """
        # for missing operators
        if input_data.get(self.field) is None:
            return self.missing or (
                self.operator == '=' and self.value is None)
        elif self.operator == '!=' and self.value is None:
            return True
        # The comparison functions are called directly: the `apply`
        # builtin no longer exists in Python 3.
        if self.term is not None:
            all_forms = fields[self.field]['summary'].get('term_forms', {})
            term_forms = all_forms.get(self.term, [])
            terms = [self.term]
            terms.extend(term_forms)
            options = fields[self.field]['term_analysis']
            return OPERATOR[self.operator](
                term_matches(input_data[self.field], terms, options),
                self.value)
        if self.operator == "in":
            # operator.contains expects the container first
            return OPERATOR[self.operator](self.value,
                                           input_data[self.field])
        return OPERATOR[self.operator](input_data[self.field],
                                       self.value)