208 lines
5.3 KiB
Python
208 lines
5.3 KiB
Python
"""
ure - unicode re

A simple script that wraps the re interface with methods to handle unicode properties.
Patterns will all have re.UNICODE enabled and unicode property formats will be replaced
with the unicode characters in that category.

Example:
    r"\p{Ll}\p{Lu}"

Licensed under MIT
Copyright (c) 2013 Isaac Muse <isaacmuse@gmail.com>
"""
|
|
import re
|
|
import sys
|
|
from os.path import dirname
|
|
try:
    import unicodedata
except ImportError:
    # Fallback: add the directory that holds the interpreter executable to the
    # module search path and retry.  Presumably this targets embedded
    # interpreters that ship the unicodedata extension beside the executable
    # -- TODO confirm.  Only ImportError is caught so that unrelated failures
    # (KeyboardInterrupt, SystemExit, ...) are not silently swallowed.
    sys.path.append(dirname(sys.executable))
    import unicodedata
|
|
|
|
# Python 2/3 compatibility: pick the builtin that turns a code point into a
# one-character unicode string.
PY3 = sys.version_info[0] >= 3
if PY3:
    uchr = chr
else:
    uchr = unichr

# Re-export the flags and helpers of the standard re module so this module can
# be used in its place.
DEBUG = re.DEBUG
I = IGNORECASE = re.IGNORECASE
L = LOCALE = re.LOCALE
M = MULTILINE = re.MULTILINE
S = DOTALL = re.DOTALL
U = UNICODE = re.UNICODE
X = VERBOSE = re.VERBOSE
escape = re.escape
purge = re.purge

# Populated by _init_unicode(): the category table and the compiled pattern
# that recognizes property references.
_unicode_properties = None
_unicode_key_pattern = None
|
|
|
|
|
|
def _build_unicode_property_table(unicode_range):
    """
    Build property table for unicode range.

    unicode_range holds the arguments handed to range(), e.g. (start, stop);
    every code point it yields is classified with unicodedata.category().

    Returns a dict keyed by the first letter of the two-letter general
    category (e.g. "L"); each value is a dict keyed by the second letter
    (e.g. "l") whose value is a single string containing every character of
    that category, in code point order.
    """
    table = {}
    for i in range(*unicode_range):
        try:
            c = uchr(i)
            p = unicodedata.category(c)
        except (ValueError, OverflowError):
            # Code point cannot be represented by this interpreter; skip it.
            # Only these errors are caught so that Ctrl-C is not swallowed
            # while walking a very large range.
            continue
        table.setdefault(p[0], {}).setdefault(p[1], []).append(c)

    # Join each character list into one string, building fresh dicts rather
    # than rewriting the ones being iterated.
    return dict(
        (major, dict((minor, ''.join(chars)) for minor, chars in minors.items()))
        for major, minors in table.items()
    )
|
|
|
|
|
|
def _build_unicode_key_pattern():
    """
    Build regex key pattern

    Returns a compiled pattern that finds property references such as
    \\p{Ll} in a regex source string.  Group 1 is the category key: a major
    letter taken from _unicode_properties, optionally followed by one of that
    letter's sub-category letters, so both \\p{Ll} and \\p{L} are recognized.

    The match itself starts at the "p": the introducing backslash is only
    asserted through a look-behind.  parse_unicode_properties() strips one
    character before the match start, so the match must not include it.
    Writing the backslash as "\\\\" inside the pattern also avoids the
    "bad escape \\p" error that Python 3.7+ raises for an unknown escape.
    """
    unicode_prop = r"(?<=\\)p\{(%s)\}"
    unicode_keys = []
    for k1 in sorted(_unicode_properties):
        # The trailing "?" makes the sub-category optional (whole major class).
        unicode_keys.append(
            "%s(?:%s)?" % (k1, "|".join(sorted(_unicode_properties[k1])))
        )
    return re.compile(unicode_prop % "|".join(unicode_keys), re.UNICODE)
|
|
|
|
|
|
def _init_unicode():
    """
    Prepare unicode property tables and key pattern

    Fills the module globals _unicode_properties and _unicode_key_pattern.
    The table must be built first because the key pattern is derived from
    its keys.
    """
    global _unicode_properties
    global _unicode_key_pattern
    # range() excludes its stop value, so go one past the highest code point
    # the interpreter supports; a stop of 0x10FFFF would leave U+10FFFF out.
    _unicode_properties = _build_unicode_property_table((0x0000, sys.maxunicode + 1))
    _unicode_key_pattern = _build_unicode_key_pattern()
|
|
|
|
|
|
def find_char_groups(s):
    """
    Find character groups

    Scans the regex source s for character classes ("[...]") and returns a
    list of (start, end) index pairs, where start is the index of the opening
    "[" and end the index of the closing "]".  Backslash-escaped characters
    are ignored, a "[" inside an open class does not start a new one, and a
    "]" placed first in a class (directly after "[" or "[^") is treated as a
    literal member rather than the terminator.
    """
    groups = []
    escaped = False
    first = None  # index of the "[" of the class currently open, if any
    for pos, c in enumerate(s):
        if escaped:
            # This character was escaped by the preceding backslash.
            escaped = False
        elif c == "\\":
            escaped = True
        elif first is None:
            if c == "[":
                first = pos
        elif c == "]":
            leading = pos == first + 1 or (pos == first + 2 and s[first + 1] == "^")
            if leading:
                # "[]...]" / "[^]...]": a leading "]" is a literal member.
                continue
            groups.append((first, pos))
            # Close the class so the next "[" starts a fresh group.
            first = None
    return groups
|
|
|
|
|
|
def get_unicode_category(prop):
    """
    Look up the characters of a unicode category in the property table.

    prop is a category key: its first character selects the major class and
    an optional second character selects the sub-category.  With only the
    major class given, the strings of all its sub-categories are concatenated.
    """
    major = prop[0]
    if len(prop) > 1:
        return _unicode_properties[major][prop[1]]
    return ''.join(_unicode_properties[major].values())
|
|
|
|
|
|
def _escape_char_class_members(chars):
    """
    Backslash-escape the characters that are special inside a "[...]" class.
    """
    # The backslash must be handled first so later escapes are not doubled.
    for special in ("\\", "]", "[", "^", "-"):
        chars = chars.replace(special, "\\" + special)
    return chars


def parse_unicode_properties(re_pattern):
    """
    Replaces regex property notation with unicode values

    Every property reference found by _unicode_key_pattern is replaced with
    the characters of that category.  A reference that already sits inside a
    character class is expanded in place; otherwise the characters are
    wrapped in "[...]".  The inserted characters are escaped so that members
    such as "]", "^", "-" or a backslash cannot change the meaning of the
    class.  Returns the rewritten pattern string.
    """
    char_groups = find_char_groups(re_pattern)
    ure_pattern = re_pattern
    # Work back to front so earlier match offsets stay valid after splicing.
    for m in reversed(list(_unicode_key_pattern.finditer(re_pattern))):
        p_pos = m.start(0)
        end = m.end(0)

        # The match starts at the "p"; it is only a property reference when
        # preceded by an odd number of backslashes (an even run is a series
        # of escaped backslashes followed by a plain "p").
        slashes = 0
        while slashes < p_pos and re_pattern[p_pos - 1 - slashes] == "\\":
            slashes += 1
        if slashes % 2 == 0:
            continue

        v = _escape_char_class_members(get_unicode_category(m.group(1)))
        in_group = any(g[0] <= p_pos and end <= g[1] for g in char_groups)
        if not in_group:
            v = "[" + v + "]"
        # p_pos - 1 also drops the backslash that introduces the reference.
        ure_pattern = ure_pattern[:p_pos - 1] + v + ure_pattern[end:]
    return ure_pattern
|
|
|
|
|
|
def compile(pattern, flags=0):
    """
    Expand unicode properties in pattern, then compile it with re.UNICODE
    added to flags.  Returns the compiled pattern object.
    """
    expanded = parse_unicode_properties(pattern)
    unicode_flags = flags | re.UNICODE
    return re.compile(expanded, unicode_flags)
|
|
|
|
|
|
def search(pattern, string, flags=0):
    """
    search after parsing unicode properties and set flag to unicode

    Returns the result of re.search (a match object or None); previously the
    result was computed but never returned.
    """
    return re.search(parse_unicode_properties(pattern), string, flags | re.UNICODE)
|
|
|
|
|
|
def match(pattern, string, flags=0):
    """
    match after parsing unicode properties and set flag to unicode

    Returns the result of re.match (a match object or None); previously the
    result was computed but never returned.
    """
    return re.match(parse_unicode_properties(pattern), string, flags | re.UNICODE)
|
|
|
|
|
|
def split(pattern, string, maxsplit=0, flags=0):
    """
    split after parsing unicode properties and set flag to unicode

    Returns the list produced by re.split; previously the result was computed
    but never returned.  maxsplit and flags are passed by keyword so they
    cannot be confused with each other.
    """
    return re.split(
        parse_unicode_properties(pattern),
        string,
        maxsplit=maxsplit,
        flags=flags | re.UNICODE,
    )
|
|
|
|
|
|
def findall(pattern, string, flags=0):
    """
    findall after parsing unicode properties and set flag to unicode

    Returns the list produced by re.findall; previously the result was
    computed but never returned.
    """
    return re.findall(parse_unicode_properties(pattern), string, flags | re.UNICODE)
|
|
|
|
|
|
def finditer(pattern, string, flags=0):
    """
    finditer after parsing unicode properties and set flag to unicode

    Returns the iterator produced by re.finditer; previously the result was
    computed but never returned.
    """
    return re.finditer(parse_unicode_properties(pattern), string, flags | re.UNICODE)
|
|
|
|
|
|
def sub(pattern, repl, string, count=0, flags=0):
    """
    sub after parsing unicode properties and set flag to unicode

    Returns the string produced by re.sub; previously the result was computed
    but never returned.  count and flags are passed by keyword so they cannot
    be confused with each other.
    """
    return re.sub(
        parse_unicode_properties(pattern),
        repl,
        string,
        count=count,
        flags=flags | re.UNICODE,
    )
|
|
|
|
|
|
def subn(pattern, repl, string, count=0, flags=0):
    """
    subn after parsing unicode properties and set flag to unicode

    Returns the (new_string, number_of_substitutions) tuple produced by
    re.subn.  Previously nothing was returned, and count was dropped so the
    flags value landed in re.subn's count position; both arguments are now
    forwarded by keyword.
    """
    return re.subn(
        parse_unicode_properties(pattern),
        repl,
        string,
        count=count,
        flags=flags | re.UNICODE,
    )
|
|
|
|
|
|
# Build the property table and key pattern once, at import time.
_init_unicode()


if __name__ == "__main__":
    # Quick manual check: show the expansion of a reference inside a
    # character class and of one outside it.
    print("Testing ure's unicode properties replacement")
    for sample in (r"[\p{Ll}\p{Lu}]", r"\p{Ll}\p{Lu}"):
        print(parse_unicode_properties(sample))
|