File: /usr/bin/unicode
#! /usr/bin/python
from __future__ import unicode_literals
import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings, io, codecs
import webbrowser, textwrap, struct
from pprint import pprint
# bz2 was introduced in 2.3, but we want this to work even if for some
# reason it is not available
try:
import bz2
except ImportError:
bz2 = None
try:
import lzma
except ImportError:
lzma = None
def is_ascii(s):
"test is string s consists completely of ascii characters"
try:
s.encode('ascii')
except UnicodeEncodeError:
return False
return True
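# illustration: is_ascii('abc') is True, is_ascii('\u010d') is False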
PY3 = sys.version_info[0] >= 3
if PY3:
import subprocess as cmd
from urllib.parse import quote as urlquote
import io
def out(*args):
"pring args, converting them to output charset"
for i in args:
sys.stdout.flush()
sys.stdout.buffer.write(i.encode(options.iocharset, 'replace'))
    # ord23: in python3 the elements of a bytes object are already integers, so this is the identity
ord23 = lambda x: x
chr_orig = chr
else: # python2
# getoutput() and getstatusoutput() methods have
# been moved from commands to the subprocess module
# with Python >= 3.x
import commands as cmd
from urllib import quote as urlquote
def out(*args):
"pring args, converting them to output charset"
for i in args:
sys.stdout.write(i.encode(options.iocharset, 'replace'))
ord23 = ord
# python3-like chr
chr_orig = chr
chr = unichr
str = unicode
range = xrange
from optparse import OptionParser
VERSION='2.4'
# list of terminals that support bidi
biditerms = ['mlterm']
try:
locale.setlocale(locale.LC_ALL, '')
except locale.Error:
pass
# guess terminal charset
try:
iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
except locale.Error:
iocharsetguess = "ascii"
if os.environ.get('TERM') in biditerms and iocharsetguess.lower().startswith('utf'):
LTR = chr(0x202d) # left to right override
else:
LTR = ''
colours = {
'no_colour' : "",
'default' : "\033[0m",
'bold' : "\033[1m",
'underline' : "\033[4m",
'blink' : "\033[5m",
'reverse' : "\033[7m",
'concealed' : "\033[8m",
'black' : "\033[30m",
'red' : "\033[31m",
'green' : "\033[32m",
'yellow' : "\033[33m",
'blue' : "\033[34m",
'magenta' : "\033[35m",
'cyan' : "\033[36m",
'white' : "\033[37m",
'on_black' : "\033[40m",
'on_red' : "\033[41m",
'on_green' : "\033[42m",
'on_yellow' : "\033[43m",
'on_blue' : "\033[44m",
'on_magenta' : "\033[45m",
'on_cyan' : "\033[46m",
'on_white' : "\033[47m",
'beep' : "\007",
}
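# the values above are standard ANSI SGR escape sequences (ESC [ n m);
# e.g. "\033[31m" switches the foreground to red and "\033[0m" resets all attributes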
general_category = {
'Lu': 'Letter, Uppercase',
'Ll': 'Letter, Lowercase',
'Lt': 'Letter, Titlecase',
'Lm': 'Letter, Modifier',
'Lo': 'Letter, Other',
'Mn': 'Mark, Non-Spacing',
'Mc': 'Mark, Spacing Combining',
'Me': 'Mark, Enclosing',
'Nd': 'Number, Decimal Digit',
'Nl': 'Number, Letter',
'No': 'Number, Other',
'Pc': 'Punctuation, Connector',
'Pd': 'Punctuation, Dash',
'Ps': 'Punctuation, Open',
'Pe': 'Punctuation, Close',
'Pi': 'Punctuation, Initial quote',
'Pf': 'Punctuation, Final quote',
'Po': 'Punctuation, Other',
'Sm': 'Symbol, Math',
'Sc': 'Symbol, Currency',
'Sk': 'Symbol, Modifier',
'So': 'Symbol, Other',
'Zs': 'Separator, Space',
'Zl': 'Separator, Line',
'Zp': 'Separator, Paragraph',
'Cc': 'Other, Control',
'Cf': 'Other, Format',
'Cs': 'Other, Surrogate',
'Co': 'Other, Private Use',
'Cn': 'Other, Not Assigned',
}
bidi_category = {
'L' : 'Left-to-Right',
'LRE' : 'Left-to-Right Embedding',
'LRO' : 'Left-to-Right Override',
'R' : 'Right-to-Left',
'AL' : 'Right-to-Left Arabic',
'RLE' : 'Right-to-Left Embedding',
'RLO' : 'Right-to-Left Override',
'PDF' : 'Pop Directional Format',
'EN' : 'European Number',
'ES' : 'European Number Separator',
'ET' : 'European Number Terminator',
'AN' : 'Arabic Number',
'CS' : 'Common Number Separator',
'NSM' : 'Non-Spacing Mark',
'BN' : 'Boundary Neutral',
'B' : 'Paragraph Separator',
'S' : 'Segment Separator',
'WS' : 'Whitespace',
'ON' : 'Other Neutrals',
'LRI' : 'Left-to-Right Isolate',
'RLI' : 'Right-to-Left Isolate',
'FSI' : 'First Strong Isolate',
'PDI' : 'Pop Directional Isolate',
}
comb_classes = {
0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
1: 'Overlays and interior',
7: 'Nuktas',
8: 'Hiragana/Katakana voicing marks',
9: 'Viramas',
10: 'Start of fixed position classes',
199: 'End of fixed position classes',
200: 'Below left attached',
202: 'Below attached',
204: 'Below right attached',
208: 'Left attached (reordrant around single base character)',
210: 'Right attached',
212: 'Above left attached',
214: 'Above attached',
216: 'Above right attached',
218: 'Below left',
220: 'Below',
222: 'Below right',
224: 'Left (reordrant around single base character)',
226: 'Right',
228: 'Above left',
230: 'Above',
232: 'Above right',
233: 'Double below',
234: 'Double above',
240: 'Below (iota subscript)',
}
def get_unicode_blocks_descriptions():
"parses Blocks.txt"
unicodeblocks = {} # (low, high): 'desc'
f = None
for name in UnicodeBlocksFiles:
f = OpenGzip(name)
if f:
break
if not f:
return {}
for line in f:
if line.startswith('#') or ';' not in line or '..' not in line:
continue
ran, desc = line.split(';')
desc = desc.strip()
low, high = ran.split('..')
low = int(low, 16)
high = int(high, 16)
unicodeblocks[ (low,high) ] = desc
return unicodeblocks
unicodeblocks = None
def get_unicode_block(ch):
"return start_of_block, end_of_block, block_name"
global unicodeblocks
if unicodeblocks is None:
unicodeblocks = get_unicode_blocks_descriptions()
ch = ord(ch)
for low, high in unicodeblocks.keys():
if low<=ch<=high:
return low, high, unicodeblocks[ (low,high) ]
def get_unicode_properties(ch):
properties = {}
if ch in linecache:
fields = linecache[ch].strip().split(';')
proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase']
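        # proplist mirrors the semicolon-separated fields of a UnicodeData.txt
        # record, e.g. (the 'dummy' slot is the decimal digit field):
        # 00E8;LATIN SMALL LETTER E WITH GRAVE;Ll;0;L;0065 0300;;;;N;LATIN SMALL LETTER E GRAVE;;00C8;;00C8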
for i, prop in enumerate(proplist):
if prop!='dummy':
properties[prop] = fields[i]
if properties['lowercase']:
properties['lowercase'] = chr(int(properties['lowercase'], 16))
if properties['uppercase']:
properties['uppercase'] = chr(int(properties['uppercase'], 16))
if properties['titlecase']:
properties['titlecase'] = chr(int(properties['titlecase'], 16))
properties['combining'] = int(properties['combining'])
properties['mirrored'] = properties['mirrored']=='Y'
else:
properties['codepoint'] = '%04X' % ord(ch)
properties['name'] = unicodedata.name(ch, '')
properties['category'] = unicodedata.category(ch)
properties['combining'] = unicodedata.combining(ch)
properties['bidi'] = unicodedata.bidirectional(ch)
properties['decomposition'] = unicodedata.decomposition(ch)
properties['digit_value'] = str(unicodedata.digit(ch, ''))
properties['numeric_value'] = str(unicodedata.numeric(ch, ''))
properties['mirrored'] = unicodedata.mirrored(ch)
properties['unicode1name'] = ''
properties['iso_comment'] = ''
        properties['uppercase'] = ch.upper() # not exact: full case mapping, not the simple mapping from UnicodeData.txt
properties['lowercase'] = ch.lower()
properties['titlecase'] = ''
return properties
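# illustration: get_unicode_properties('\u00e8')['name'] == 'LATIN SMALL LETTER E WITH GRAVE'
# and get_unicode_properties('\u00e8')['category'] == 'Ll'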
def do_init():
HomeDir = os.path.expanduser('~/.unicode')
HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt")
global UnicodeDataFileNames
UnicodeDataFileNames = [HomeUnicodeData, '/usr/share/unicode/UnicodeData.txt', '/usr/share/unicode-data/UnicodeData.txt', '/usr/share/unidata/UnicodeData.txt', '/usr/share/unicode/ucd/UnicodeData.txt', './UnicodeData.txt'] + \
glob.glob('/usr/share/unidata/UnicodeData*.txt') + \
glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt') + \
glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt') # for MacOSX
HomeUnihanData = os.path.join(HomeDir, "Unihan*")
global UnihanDataGlobs
UnihanDataGlobs = [HomeUnihanData, '/usr/share/unidata/Unihan*', '/usr/share/unicode-data/Unihan*', '/usr/share/unicode/Unihan*', './Unihan*']
global UnicodeBlocksFiles
UnicodeBlocksFiles = ['/usr/share/unicode/Blocks.txt', '/usr/share/unicode-data/Blocks.txt', '/usr/share/unidata/Blocks.txt', './Blocks.txt']
# cache where grepped unicode properties are kept
global linecache
linecache = {}
def get_unihan_files():
fos = [] # list of file names for Unihan data file(s)
for gl in UnihanDataGlobs:
fnames = glob.glob(gl)
fos += fnames
return fos
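# both Unihan parsers below rely on the Unihan data file format: one
# tab-separated record per line, i.e. U+XXXX<TAB>property<TAB>value,
# sorted by codepoint (which is what allows the early break)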
def get_unihan_properties_internal(ch):
properties = {}
ch = ord(ch)
global unihan_fs
for f in unihan_fs:
fo = OpenGzip(f)
for l in fo:
if l.startswith('#'):
continue
line = l.strip()
if not line:
continue
            char, key, value = line.strip().split('\t')
            if int(char[2:], 16) == ch:
                # OpenGzip already returns decoded text; no further decode needed
                properties[key] = value
elif int(char[2:], 16)>ch:
break
return properties
def get_unihan_properties_zgrep(ch):
properties = {}
global unihan_fs
ch = ord(ch)
chs = 'U+%X' % ch
for f in unihan_fs:
if f.endswith('.gz'):
grepcmd = 'zgrep'
elif f.endswith('.bz2'):
grepcmd = 'bzgrep'
elif f.endswith('.xz'):
grepcmd = 'xzgrep'
else:
grepcmd = 'grep'
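        # the r'\\b' below reaches grep as \b (a word-boundary anchor):
        # the shell consumes one of the two backslashes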
cmdline = grepcmd+' ^'+chs+r'\\b '+f
status, output = cmd.getstatusoutput(cmdline)
if not PY3:
output = unicode(output, 'utf-8')
output = output.split('\n')
for l in output:
if not l:
continue
char, key, value = l.strip().split('\t')
if int(char[2:], 16) == ch:
properties[key] = value
elif int(char[2:], 16)>ch:
break
return properties
# basic sanity check, if e.g. you run this on MS Windows...
if os.path.exists('/bin/grep'):
get_unihan_properties = get_unihan_properties_zgrep
else:
get_unihan_properties = get_unihan_properties_internal
def error(txt):
out(txt)
out('\n')
sys.exit(1)
def get_gzip_filename(fname):
"return fname, if it does not exist, return fname+.gz, if neither that, fname+bz2, if neither that, return None"
if os.path.exists(fname):
return fname
if os.path.exists(fname+'.gz'):
return fname+'.gz'
if os.path.exists(fname+'.bz2') and bz2 is not None:
return fname+'.bz2'
if os.path.exists(fname+'.xz') and lzma is not None:
return fname+'.xz'
return None
def OpenGzip(fname):
    "open fname; if it does not exist, try fname.gz, fname.bz2 or fname.xz; return a text-mode file object reading utf-8, or None"
    fname = get_gzip_filename(fname)
    if not fname:
        return None
    if fname.endswith('.gz'):
        fo = gzip.GzipFile(fname)
    elif fname.endswith('.bz2'):
        fo = bz2.BZ2File(fname)
    elif fname.endswith('.xz'):
        fo = lzma.open(fname)
    else:
        # a plain file; io.open does the decoding for us
        return io.open(fname, encoding='utf-8')
    # we cannot use TextIOWrapper, since it needs the read1 method,
    # which the gzip/bz2 file objects do not implement
    return codecs.getreader('utf-8')(fo)
def GrepInNames(pattern, prefill_cache=False):
pat = re.compile(pattern, re.I)
f = None
for name in UnicodeDataFileNames:
f = OpenGzip(name)
        if f is not None:
break
if not f:
out( """
Cannot find UnicodeData.txt, please place it into
/usr/share/unidata/UnicodeData.txt,
/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current
working directory (optionally you can gzip it).
Without the file, searching will be much slower.
""" )
if prefill_cache:
if f:
for l in f:
if pat.search(l):
r = myunichr(int(l.split(';')[0], 16))
linecache[r] = l
f.close()
else:
if f:
for l in f:
if pat.search(l):
r = myunichr(int(l.split(';')[0], 16))
linecache[r] = l
yield r
f.close()
else:
        for i in range(sys.maxunicode + 1):
try:
name = unicodedata.name(chr(i))
if pat.search(name):
yield myunichr(i)
except ValueError:
pass
def valfromcp(n, cp=None):
    """if cp is given, 'n' is interpreted as a number in that codepage and
    converted accordingly; the result is a list of codepoints (integers)"""
if cp:
xh = '%x' %n
if len(xh) % 2: # pad hexadecimal representation with a zero
xh = '0'+xh
cps = ( [xh[i:i+2] for i in range(0,len(xh),2)] )
cps = ( int(i, 16) for i in cps)
        # we have to use chr_orig (the original chr of python2) and not 'B',
        # because with unicode_literals in effect 'B' would be a unicode string,
        # which struct.pack does not accept in python 2.6
cps = ( struct.pack(chr_orig(0x42),i) for i in cps ) # this works in both python3 and python2, unlike bytes([i])
cps = b''.join(cps)
cps = cps.decode(cp)
cps = [ord(x) for x in cps]
return cps
else:
return [n]
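# illustration (assuming the cp1250 codec is available):
#   valfromcp(0xE8, 'cp1250') == [0x10D]   # byte E8 is č in cp1250
#   valfromcp(0xE8) == [0xE8]              # no codepage, value passed through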
def myunichr(n):
try:
r = chr(n)
return r
except OverflowError:
traceback.print_exc()
error("The codepoint is too big - it does not fit into an int.")
except ValueError:
traceback.print_exc()
err = "The codepoint is too big."
if sys.maxunicode <= 0xffff:
err += "\nPerhaps your python interpreter is not compiled with wide unicode characters."
error(err)
def guesstype(arg):
if not arg: # empty string
return 'empty string', arg
elif not is_ascii(arg):
return 'string', arg
elif arg[:2]=='U+' or arg[:2]=='u+': # it is hexadecimal number
try:
val = int(arg[2:], 16)
if val>sys.maxunicode:
return 'regexp', arg
else:
return 'hexadecimal', arg[2:]
except ValueError:
return 'regexp', arg
elif arg[0] in "Uu" and len(arg)>4:
try:
val = int(arg[1:], 16)
if val>sys.maxunicode:
return 'regexp', arg
else:
return 'hexadecimal', arg
except ValueError:
return 'regexp', arg
elif len(arg)>=4:
if len(arg) in (8, 16, 24, 32):
if all(x in '01' for x in arg):
val = int(arg, 2)
if val<=sys.maxunicode:
return 'binary', arg
try:
val = int(arg, 16)
if val>sys.maxunicode:
return 'regexp', arg
else:
return 'hexadecimal', arg
except ValueError:
return 'regexp', arg
else:
return 'string', arg
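# illustration of the guessing: guesstype('U+20AC') == ('hexadecimal', '20AC'),
# guesstype('0041') == ('hexadecimal', '0041'), guesstype('euro') == ('regexp', 'euro')
# (not a valid hex number), guesstype('abc') == ('string', 'abc') (shorter than 4 chars)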
def process(arglist, t, fromcp=None, prefill_cache=False):
# build a list of values, so that we can combine queries like
# LATIN ALPHA and search for LATIN.*ALPHA and not names that
# contain either LATIN or ALPHA
result = []
names_query = [] # reserved for queries in names - i.e. -r
for arg_i in arglist:
        if t is None:
tp, arg = guesstype(arg_i)
if tp == 'regexp':
# if the first argument is guessed to be a regexp, add
# all the following arguments to the regular expression -
# this is probably what you wanted, e.g.
# 'unicode cyrillic be' will now search for the 'cyrillic.*be' regular expression
t = 'regexp'
else:
tp, arg = t, arg_i
if tp=='hexadecimal':
val = int(arg, 16)
vals = valfromcp(val, fromcp)
for val in vals:
r = myunichr(val)
result.append(r)
elif tp=='decimal':
val = int(arg, 10)
vals = valfromcp(val, fromcp)
for val in vals:
r = myunichr(val)
result.append(r)
elif tp=='octal':
val = int(arg, 8)
vals = valfromcp(val, fromcp)
for val in vals:
r = myunichr(val)
result.append(r)
elif tp=='binary':
val = int(arg, 2)
vals = valfromcp(val, fromcp)
for val in vals:
r = myunichr(val)
result.append(r)
elif tp=='regexp':
names_query.append(arg)
elif tp=='string':
unirepr = arg
for r in unirepr:
result.append(r)
elif tp=='empty string':
pass # do not do anything for an empty string
if result and prefill_cache:
hx = '|'.join('%04X'%ord(x) for x in result)
list(GrepInNames(hx, prefill_cache=True))
if names_query:
query = '.*'.join(names_query)
for r in GrepInNames(query):
result.append(r)
return result
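# illustration: process(['U+40', 'xyz'], None) returns ['@', 'x', 'y', 'z'];
# process(['LATIN', 'ALPHA'], 'regexp') greps character names for 'LATIN.*ALPHA'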
def maybe_colours(colour):
if options.use_colour:
return colours[colour]
else:
return ""
# format key and value
def printkv(*l):
for i in range(0, len(l), 2):
if i<len(l)-2:
sep = " "
else:
sep = "\n"
k, v = l[i], l[i+1]
out(maybe_colours('green'))
out(k)
out(": ")
out(maybe_colours('default'))
out(str(v))
out(sep)
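# e.g. printkv('Category', 'Ll') prints "Category: Ll" (with the key in green
# when colours are enabled)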
def print_characters(clist, maxcount, format_string, query_wikipedia=0, query_wiktionary=0):
"""query_wikipedia or query_wiktionary:
0 - don't
1 - spawn browser
"""
counter = 0
for c in clist:
if query_wikipedia or query_wiktionary:
ch = urlquote(c.encode('utf-8')) # wikipedia uses UTF-8 in names
wiki_base_url = 'http://en.wikipedia.org/wiki/'
if query_wiktionary:
wiki_base_url = 'http://en.wiktionary.org/wiki/'
wiki_url = wiki_base_url+ch
webbrowser.open(wiki_url)
query_wikipedia = query_wiktionary = 0 # query only the very first character
if maxcount:
counter += 1
if counter > options.maxcount:
out("\nToo many characters to display, more than %s, use --max option to change it\n" % options.maxcount)
return
for colour_key in colours.keys():
locals()[colour_key] = maybe_colours(colour_key)
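            # caution: mutating locals() is undefined by the language spec; this
            # relies on CPython, where the injected keys stay in the frame's
            # f_locals and are picked up by format(**locals()) below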
properties = get_unicode_properties(c)
ordc = ord(c)
if properties['name']:
name = properties['name']
else:
name = " - No such unicode character name in database"
utf8 = ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')])
utf16be = ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')])
decimal = "&#%s;" % ordc
octal = "\\0%o" % ordc
addcharset = options.addcharset
if addcharset:
try:
in_additional_charset = ' '.join([("%02x" % ord23(x)) for x in c.encode(addcharset)] )
except UnicodeError:
in_additional_charset = "NONE"
if properties['combining']:
pchar = " "+c
else:
pchar = c
uppercase = properties['uppercase']
lowercase = properties['lowercase']
opt_uppercase = opt_lowercase = ''
flipcase = None
if uppercase:
ord_uppercase = ord(properties['uppercase'])
opt_uppercase = '\n{green}Uppercase:{default} {ord_uppercase:04X}'.format(**locals())
flipcase = uppercase
elif lowercase:
ord_lowercase = ord(properties['lowercase'])
opt_lowercase = '\n{green}Lowercase:{default} {ord_lowercase:04X}'.format(**locals())
flipcase = lowercase
category = properties['category']
category_desc = general_category[category]
opt_numeric = ''
numeric_desc = ''
if properties['numeric_value']:
opt_numeric = 'Numeric value: '
numeric_desc = properties['numeric_value']+'\n'
opt_digit = ''
digit_desc = ''
if properties['digit_value']:
opt_digit = 'Digit value: '
digit_desc = properties['digit_value']+'\n'
opt_bidi = ''
bidi_desc = ''
bidi = properties['bidi']
bidi_desc = bidi_category.get(bidi, bidi)
if bidi:
opt_bidi = 'Bidi: '
bidi_desc = ' ({0})'.format(bidi_desc)
mirrored_desc = ''
mirrored = properties['mirrored']
if mirrored:
mirrored_desc = 'Character is mirrored\n'
opt_combining = ''
comb = properties['combining']
combining_desc = ''
if comb:
opt_combining = 'Combining: '
combining_desc = "{comb} ({comb_class})\n".format(comb=comb, comb_class=comb_classes.get(comb, '?'))
opt_decomp = ''
decomp_desc = ''
decomp = properties['decomposition']
if decomp:
opt_decomp = 'Decomposition: '
decomp_desc = decomp+'\n'
opt_unicode_block = ''
opt_unicode_block_desc = ''
unicode_block = get_unicode_block(c)
if unicode_block:
low, high, desc = unicode_block
opt_unicode_block = 'Unicode block: '
opt_unicode_block_desc = "{low:04X}..{high:04X}; {desc}\n".format(low=low,high=high,desc=desc)
if addcharset:
opt_additional = ' {green}{addcharset}:{default} {in_additional_charset}'.format(**locals())
else:
opt_additional = ''
if flipcase:
opt_flipcase = ' ({flipcase})'.format(**locals())
else:
opt_flipcase = ''
formatted_output = format_string.format(**locals())
out(formatted_output)
if options.verbosity>0:
uhp = get_unihan_properties(c)
for key in uhp:
printkv(key, uhp[key])
def print_block(block):
    # header
out(" "*10)
for i in range(16):
out(".%X " % i)
out('\n')
    # body
for i in range(block*16, block*16+16):
hexi = "%X" % i
if len(hexi)>3:
hexi = "%07X" % i
hexi = hexi[:4]+" "+hexi[4:]
else:
hexi = " %03X" % i
out(LTR+hexi+". ")
for j in range(16):
c = chr(i*16+j)
c_out = c
if unicodedata.combining(c):
c_out = " "+c
# fallback for python without east_asian_width (probably unnecessary, since this script does not work with <2.6 anyway)
fullwidth = 'east_asian_width' in unicodedata.__dict__ and unicodedata.east_asian_width(c)[0] in 'FW'
if not fullwidth:
c_out = ' '+c_out
out(c_out)
out(' ')
out('\n')
out('\n')
def print_blocks(blocks):
for block in blocks:
print_block(block)
def is_range(s, typ):
sp = s.split('..')
if len(sp)!=2:
return False
if not sp[1]:
sp[1] = sp[0]
elif not sp[0]:
sp[0] = sp[1]
if not sp[0]:
return False
low = list(process([sp[0]], typ)) # intentionally no fromcp here, ranges are only of unicode characters
high = list(process([sp[1]], typ))
if len(low)!=1 or len(high)!=1:
return False
low = ord(low[0])
high = ord(high[0])
low = low // 256
high = high // 256 + 1
return range(low, high)
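# is_range() works in whole 256-codepoint blocks: e.g. '0400..04FF' (or any
# subrange thereof) yields range(4, 5), and print_block(4) prints U+0400..U+04FF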
def unescape(s):
return s.replace(r'\n', '\n')
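# unescape() lets literal \n sequences in a user-supplied --format string turn
# into real newlines, e.g. --format '{pchar} U+{ordc:04X} {name}\n'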
format_string_default = '''{yellow}{bold}U+{ordc:04X} {name}{default}
{green}UTF-8:{default} {utf8} {green}UTF-16BE:{default} {utf16be} {green}Decimal:{default} {decimal} {green}Octal:{default} {octal}{opt_additional}
{pchar}{opt_flipcase}{opt_uppercase}{opt_lowercase}
{green}Category:{default} {category} ({category_desc})
{green}{opt_unicode_block}{default}{opt_unicode_block_desc}{green}{opt_numeric}{default}{numeric_desc}{green}{opt_digit}{default}{digit_desc}{green}{opt_bidi}{default}{bidi}{bidi_desc}
{mirrored_desc}{green}{opt_combining}{default}{combining_desc}{green}{opt_decomp}{default}{decomp_desc}
'''
def main():
parser = OptionParser(usage="usage: %prog [options] arg")
parser.add_option("-x", "--hexadecimal",
action="store_const", const='hexadecimal', dest="type",
help="Assume arg to be hexadecimal number")
parser.add_option("-o", "--octal",
action="store_const", const='octal', dest="type",
help="Assume arg to be octal number")
parser.add_option("-b", "--binary",
action="store_const", const='binary', dest="type",
help="Assume arg to be binary number")
parser.add_option("-d", "--decimal",
action="store_const", const='decimal', dest="type",
help="Assume arg to be decimal number")
parser.add_option("-r", "--regexp",
action="store_const", const='regexp', dest="type",
help="Assume arg to be regular expression")
parser.add_option("-s", "--string",
action="store_const", const='string', dest="type",
help="Assume arg to be a sequence of characters")
parser.add_option("-a", "--auto",
action="store_const", const=None, dest="type",
help="Try to guess arg type (default)")
parser.add_option("-m", "--max",
action="store", default=10, dest="maxcount", type="int",
help="Maximal number of codepoints to display, default: 10; 0=unlimited")
parser.add_option("-i", "--io",
action="store", default=iocharsetguess, dest="iocharset", type="string",
help="I/O character set, I am guessing %s" % iocharsetguess)
parser.add_option("--fcp", "--fromcp",
action="store", default='', dest="fromcp", type="string",
help="Convert numerical arguments from this encoding, default: no conversion")
parser.add_option("-c", "--charset-add",
action="store", dest="addcharset", type="string",
help="Show hexadecimal reprezentation in this additional charset")
parser.add_option("-C", "--colour",
action="store", dest="use_colour", type="string",
default="auto",
help="Use colours, on, off or auto")
parser.add_option('', "--color",
action="store", dest="use_colour", type="string",
default="auto",
help="synonym for --colour")
parser.add_option("-v", "--verbose",
action="count", dest="verbosity",
default=0,
help="Increase verbosity (reads Unihan properties - slow!)")
parser.add_option("-w", "--wikipedia",
action="count", dest="query_wikipedia",
default=0,
help="Query wikipedia for the character")
parser.add_option("--wt", "--wiktionary",
action="count", dest="query_wiktionary",
default=0,
help="Query wiktionary for the character")
parser.add_option("--list",
action="store_const", dest="list_all_encodings",
const=True,
help="List (approximately) all known encodings")
parser.add_option("--format",
action="store", dest="format_string", type="string",
default=format_string_default,
help="formatting string")
parser.add_option("--brief", "--terse",
action="store_const", dest="format_string",
const='{pchar} U+{ordc:04X} {name}\n',
help="Brief format")
global options
(options, arguments) = parser.parse_args()
format_string = unescape(options.format_string)
do_init()
if options.list_all_encodings:
all_encodings = os.listdir(os.path.dirname(encodings.__file__))
all_encodings = set([os.path.splitext(x)[0] for x in all_encodings])
all_encodings = list(all_encodings)
all_encodings.sort()
print (textwrap.fill(' '.join(all_encodings)))
sys.exit()
if len(arguments)==0:
parser.print_help()
sys.exit()
if options.use_colour.lower() in ("on", "1", "true", "yes"):
# we reuse the options.use_colour, so that we do not need to use another global
options.use_colour = True
elif options.use_colour.lower() in ("off", "0", "false", "no"):
options.use_colour = False
else:
options.use_colour = sys.stdout.isatty()
if sys.platform == 'win32':
options.use_colour = False
l_args = [] # list of non range arguments to process
for argum in arguments:
if PY3:
# in python3, argv is automatically decoded into unicode
# but we have to check for surrogates
argum = argum.encode(options.iocharset, 'surrogateescape')
try:
argum = argum.decode(options.iocharset)
except UnicodeDecodeError:
error ("Sequence %s is not valid in charset '%s'." % (repr(argum), options.iocharset))
is_r = is_range(argum, options.type)
if is_r:
print_blocks(is_r)
else:
l_args.append(argum)
if l_args:
global unihan_fs
unihan_fs = []
if options.verbosity>0:
unihan_fs = get_unihan_files() # list of file names for Unihan data file(s), empty if not available
if not unihan_fs:
out( """
Unihan_*.txt files not found. In order to view Unihan properties,
please place the files into /usr/share/unidata/,
/usr/share/unicode/, ~/.unicode/
or the current working directory (optionally you can gzip, bzip2 or xz them).
You can get the files by unpacking ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
Warning: listing Unihan properties is rather slow.
""")
options.verbosity = 0
processed_args = process(l_args, options.type, options.fromcp, prefill_cache=True)
print_characters(processed_args, options.maxcount, format_string, options.query_wikipedia, options.query_wiktionary)
if __name__ == '__main__':
main()