Efficiently match multiple regexes in Python
Lexical analyzers are quite easy to write when you have regexes. Today I wanted to write a simple general analyzer in Python, and came up with:
import re
import sys


class Token(object):
    """ A simple Token structure.
        Contains the token type, value and position.
    """
    def __init__(self, type, val, pos):
        self.type = type
        self.val = val
        self.pos = pos

    def __str__(self):
        return '%s(%s) at %s' % (self.type, self.val, self.pos)


class LexerError(Exception):
    """ Lexer error exception.

        pos: position in the input line where the error occurred.
    """
    def __init__(self, pos):
        self.pos = pos


class Lexer(object):
    """ A simple regex-based lexer/tokenizer.

        See below for an example of usage.
    """
    def __init__(self, rules, skip_whitespace=True):
        """ Create a lexer.

            rules:
                A list of rules. Each rule is a `regex, type` pair, where
                `regex` is the regular expression used to recognize the token
                and `type` is the type of the token to return when it's
                recognized.

            skip_whitespace:
                If True, whitespace (\s+) will be skipped and not reported by
                the lexer. Otherwise, you have to specify your rules for
                whitespace, or it will be flagged as an error.
        """
        self.rules = []
        for regex, type in rules:
            self.rules.append((re.compile(regex), type))

        self.skip_whitespace = skip_whitespace
        self.re_ws_skip = re.compile('\S')

    def input(self, buf):
        """ Initialize the lexer with a buffer as input. """
        self.buf = buf
        self.pos = 0

    def token(self):
        """ Return the next token (a Token object) found in the input buffer.
            None is returned if the end of the buffer was reached.
            In case of a lexing error (the current chunk of the buffer matches
            no rule), a LexerError is raised with the position of the error.
        """
        if self.pos >= len(self.buf):
            return None
        else:
            if self.skip_whitespace:
                m = self.re_ws_skip.search(self.buf[self.pos:])
                if m:
                    self.pos += m.start()
                else:
                    return None

            for token_regex, token_type in self.rules:
                m = token_regex.match(self.buf[self.pos:])
                if m:
                    value = self.buf[self.pos + m.start():self.pos + m.end()]
                    tok = Token(token_type, value, self.pos)
                    self.pos += m.end()
                    return tok

            # If we're here, no rule matched
            raise LexerError(self.pos)

    def tokens(self):
        """ Returns an iterator to the tokens found in the buffer. """
        while 1:
            tok = self.token()
            if tok is None:
                break
            yield tok


if __name__ == '__main__':
    rules = [
        ('\d+',             'NUMBER'),
        ('[a-zA-Z_]\w+',    'IDENTIFIER'),
        ('\+',              'PLUS'),
        ('\-',              'MINUS'),
        ('\*',              'MULTIPLY'),
        ('\/',              'DIVIDE'),
        ('\(',              'LP'),
        ('\)',              'RP'),
        ('=',               'EQUALS'),
    ]

    lx = Lexer(rules, skip_whitespace=True)
    lx.input('erw = _abc + 12*(r4-623902) ')
    try:
        for tok in lx.tokens():
            print tok
    except LexerError, err:
        print 'LexerError at position', err.pos
It works just fine, but I'm a bit worried that it's too inefficient. Are there any regex tricks that would allow me to write it in a more efficient or elegant way?
Specifically, is there a way to avoid looping over all the regex rules linearly to find the one that fits?
You can merge all your regexes into one using the "|" operator and let the regex library do the work of discerning between tokens. Some care should be taken to ensure the preference of tokens (for example, to avoid matching a keyword as an identifier).
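Here is a minimal sketch of that idea, assuming the individual rule regexes contain no named groups of their own and that the token type names are valid Python identifiers; the tokenize_combined helper below is illustrative, not part of the code above. It builds one alternation of named groups (?P<TYPE>regex) and uses match.lastgroup to recover which rule fired, so the rules are not tried one compiled pattern at a time in Python code.

import re


def tokenize_combined(rules, text):
    """ Tokenize `text` using a single combined regex built from `rules`
        (a list of (regex, type) pairs, as in the Lexer above).
        Yields (type, value, pos) tuples; raises ValueError on a lexing error.
    """
    # One big alternation: (?P<NUMBER>\d+)|(?P<IDENTIFIER>[a-zA-Z_]\w+)|...
    combined = re.compile('|'.join('(?P<%s>%s)' % (type, regex)
                                   for regex, type in rules))
    ws_skip = re.compile(r'\S')

    pos = 0
    while True:
        # Jump to the next non-whitespace character (same trick as above).
        m = ws_skip.search(text, pos)
        if not m:
            return
        pos = m.start()

        m = combined.match(text, pos)
        if not m:
            raise ValueError('no rule matches at position %d' % pos)
        # lastgroup is the name of the alternative that matched, i.e. the
        # token type.
        yield (m.lastgroup, m.group(), pos)
        pos = m.end()


rules = [(r'\d+', 'NUMBER'), (r'[a-zA-Z_]\w+', 'IDENTIFIER'), (r'\+', 'PLUS'),
         (r'\-', 'MINUS'), (r'\*', 'MULTIPLY'), (r'\/', 'DIVIDE'),
         (r'\(', 'LP'), (r'\)', 'RP'), (r'=', 'EQUALS')]

for tok in tokenize_combined(rules, 'erw = _abc + 12*(r4-623902)'):
    print('%s(%s) at %s' % tok)

Note that Python's re module tries alternatives left to right and takes the first one that matches, so the order of the rules is what decides preference: list keyword rules before the IDENTIFIER rule, for instance.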