-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrailwaytokeniser.py
More file actions
114 lines (95 loc) · 3.78 KB
/
Copy pathrailwaytokeniser.py
File metadata and controls
114 lines (95 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from collections import namedtuple
import re
class RailwayLexingError(RuntimeError):
    """Signals that the input could not be tokenised at a given position.

    The exception's ``args`` remain ``(line, col)`` so that copying and
    pickling keep working; only the human-readable message is customised.

    Attributes:
        line: line number passed by the raiser.
        col: column offset passed by the raiser.
    """

    def __init__(self, line, col):
        # Forward the raw position so ``args`` is populated explicitly.
        super().__init__(line, col)
        self.line, self.col = line, col

    def __str__(self):
        # Without this the message is just the bare tuple "(line, col)".
        return f'Railway lexing error at line {self.line}, column {self.col}'
# Identifiers: a letter or underscore followed by letters, digits,
# underscores or dots (a dotted name lexes as a single NAME token).
name_regex = re.compile(r'[a-zA-Z_][a-zA-Z0-9_.]*')
# Numeric literals: an integer, optionally followed by '/' and another integer.
number_regex = re.compile(r'\d+(/\d+)?')
# String literals in double or single quotes; the pattern has no escape syntax.
string_regex = re.compile('("[^"]*")|(\'[^\']*\')')
# A backslash, optional trailing blanks, then the newline being escaped.
escaped_newline_regex = re.compile(r'\\[ \t\r\f\v]*\n')
# Skipped input: $...$ comments and runs of non-newline whitespace.
ignore_regex = re.compile(r'([$][^$]*[$])|([ \t\r\f\v]+)')
# Characters that may continue a word; used to find keyword boundaries.
_word_char_regex = re.compile(r'[a-zA-Z0-9_]')

# Every keyword and operator; a matching token uses its own text as its type.
symbols = {
    'import', 'as', 'global', 'let', 'unlet', 'func', 'return', 'println',
    'print', 'if', 'fi', 'else', 'loop', 'pool', 'for', 'rof', 'call', 'uncall',
    'do', 'undo', 'yield', 'swap', 'push', 'pop', 'try', 'catch', 'yrt',
    'promote', 'in', 'to', 'by', 'tensor', 'barrier', 'mutex', 'xetum',
    'TID', '#TID',
    '<=>', '<=', '=>', '>=', '!=', '==',
    '//=', '**=', '+=', '-=', '*=', '/=', '%=', '^=', '|=', '&=',
    '//', '**', '<', '>', '=', '+', '-', '*', '/', '%', '^', '|', '&',
    '(', ')', '[', ']', '{', '}', ',', '.', '#', '!'
}
# Upper bound on how many characters a symbol lookup needs to examine.
max_symbol_length = max(len(s) for s in symbols)

DefaultToken = namedtuple('Token', ['type', 'string', 'line', 'col'])


def _match_symbol(data, pos, len_data):
    """Return the longest symbol starting at ``data[pos]``, or ``None``.

    A symbol that ends in a word character (a keyword, ``TID``, ``#TID``) is
    rejected when the next input character is also a word character, so that
    identifiers such as ``index`` or ``total`` are not split into a keyword
    prefix plus a remainder.
    """
    for sym_length in range(min(max_symbol_length, len_data - pos), 0, -1):
        endpos = pos + sym_length
        candidate = data[pos:endpos]
        if candidate not in symbols:
            continue
        if (endpos < len_data
                and _word_char_regex.match(candidate[-1])
                and _word_char_regex.match(data[endpos])):
            continue
        return candidate
    return None


def _advance(line, col, text):
    """Return the ``(line, col)`` reached after consuming ``text``."""
    newline_count = text.count('\n')
    if newline_count:
        # Column restarts after the last newline inside the consumed text.
        return line + newline_count, len(text) - text.rfind('\n') - 1
    return line, col + len(text)


def tokenise(data, TokenClass=DefaultToken):
    """Lazily split Railway source text into tokens.

    Args:
        data: the complete source text.
        TokenClass: callable invoked as ``TokenClass(type, string, line, col)``
            to build each token; defaults to the ``DefaultToken`` namedtuple.

    Yields:
        Tokens in source order. Symbols use their own text as the type; other
        types are ``'NAME'``, ``'NUMBER'``, ``'STRING'`` (quotes stripped),
        ``'NEWLINE'`` and a final ``'ENDMARKER'``. A ``NEWLINE`` is emitted
        only after a line that produced at least one token, and one is
        synthesised before ``ENDMARKER`` if the input does not end in a
        newline. Lines are counted from 1 and columns from 0.

    Raises:
        RailwayLexingError: if no token can be matched at the current position.
    """
    line, col = 1, 0
    pos = 0
    # True while no token has been produced since the last NEWLINE (or the
    # start of input); suppresses NEWLINE tokens for blank lines.
    skip_newline = True
    len_data = len(data)
    while pos < len_data:
        if data[pos] == '\n':
            if not skip_newline:
                yield TokenClass('NEWLINE', '\n', line, col)
                skip_newline = True
            line += 1
            col = 0
            pos += 1
            continue

        symbol = _match_symbol(data, pos, len_data)
        if symbol is not None:
            yield TokenClass(symbol, symbol, line, col)
            skip_newline = False
            col += len(symbol)
            pos += len(symbol)
            continue

        name_match = name_regex.match(data, pos)
        if name_match:
            endpos = name_match.end()
            yield TokenClass('NAME', data[pos:endpos], line, col)
            skip_newline = False
            col += endpos - pos
            pos = endpos
            continue

        number_match = number_regex.match(data, pos)
        if number_match:
            endpos = number_match.end()
            yield TokenClass('NUMBER', data[pos:endpos], line, col)
            skip_newline = False
            col += endpos - pos
            pos = endpos
            continue

        string_match = string_regex.match(data, pos)
        if string_match:
            endpos = string_match.end()
            # The token text excludes the surrounding quote characters.
            yield TokenClass('STRING', data[pos + 1:endpos - 1], line, col)
            skip_newline = False
            # A string literal may contain newlines.
            line, col = _advance(line, col, data[pos:endpos])
            pos = endpos
            continue

        ignore_match = ignore_regex.match(data, pos)
        if ignore_match:
            endpos = ignore_match.end()
            # A $...$ comment may contain newlines.
            line, col = _advance(line, col, data[pos:endpos])
            pos = endpos
            continue

        escaped_newline_match = escaped_newline_regex.match(data, pos)
        if escaped_newline_match:
            # Line continuation: move to the next line without a NEWLINE token.
            line += 1
            col = 0
            pos = escaped_newline_match.end()
            continue

        raise RailwayLexingError(line, col)

    if not skip_newline:
        yield TokenClass('NEWLINE', '\n', line, col)
    yield TokenClass('ENDMARKER', '', line, col)
if __name__ == '__main__':
    # Ad-hoc debugging aid: lex 'tmp.rail' from the working directory and
    # print each token's text next to its type.
    with open('tmp.rail') as source_file:
        source_text = source_file.read()
    for token in tokenise(source_text):
        print(f'{repr(token.string):12s}: {token.type}')