Incorporate KindleUnpack from https://github.com/kevinhendricks/KindleUnpack
The GUI elements have been removed
This commit is contained in:
278
KindleUnpack/compatibility_utils.py
Executable file
278
KindleUnpack/compatibility_utils.py
Executable file
@@ -0,0 +1,278 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification,
|
||||
# are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this list of
|
||||
# conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
|
||||
# of conditions and the following disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
|
||||
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
import sys
|
||||
import codecs
|
||||
|
||||
# Interpreter major-version flags consulted by the rest of this module.
PY2 = (sys.version_info[0] == 2)
PY3 = (sys.version_info[0] == 3)

# True when sys.platform names a Windows build.
iswindows = sys.platform.startswith('win')

# Try the Python 3 location of unquote first, then the Python 2 one.
try:
    from urllib.parse import unquote
except ImportError:
    from urllib import unquote

# Bind _h to an object that exposes an unescape() callable:
#   Python 2          -> an HTMLParser instance
#   minor version < 4 -> an html.parser.HTMLParser instance
#   anything else     -> the html module itself
if PY2:
    from HTMLParser import HTMLParser
    _h = HTMLParser()
elif sys.version_info[1] < 4:
    import html.parser
    _h = html.parser.HTMLParser()
else:
    import html as _h

# Canonical names for the text and bytes types of the running interpreter.
if not PY3:
    range = xrange
    text_type = unicode
    binary_type = str
    # If unicode will be printed under Python 2, protect against
    # sys.stdout.encoding being None, which forces ascii encoding of unicode:
    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    # Alternatively set this environment variable **before** launching
    # python: export PYTHONIOENCODING=UTF-8
else:
    text_type = str
    binary_type = bytes
    # If arbitrary binary data will be printed to stdout on Python 3:
    # sys.stdin = sys.stdin.detach()
    # sys.stdout = sys.stdout.detach()
    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
|
||||
|
||||
# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
|
||||
# (and they amazingly claim by design and no bug!)
|
||||
|
||||
# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
|
||||
# >>> o = '123456789'
|
||||
# >>> o[-3]
|
||||
# '7'
|
||||
# >>> type(o[-3])
|
||||
# <class 'str'>
|
||||
# >>> type(o)
|
||||
# <class 'str'>
|
||||
|
||||
# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
|
||||
# >>> o = b'123456789'
|
||||
# >>> o[-3]
|
||||
# 55
|
||||
# >>> type(o[-3])
|
||||
# <class 'int'>
|
||||
# >>> type(o)
|
||||
# <class 'bytes'>
|
||||
|
||||
# This mind boggling behaviour also happens when indexing a bytestring and/or
|
||||
# iterating over a bytestring. In other words it will return an int but not
|
||||
# the byte itself!!!!!!!
|
||||
|
||||
# The only way to access a single byte as a byte in bytestring and get the byte in both
|
||||
# Python 2 and Python 3 is to use a slice
|
||||
|
||||
# This problem is so common there are horrible hacks floating around the net to **try**
|
||||
# to work around it, so that code that works on both Python 2 and Python 3 is possible.
|
||||
|
||||
# So in order to write code that works on both Python 2 and Python 3
|
||||
# if you index or access a single byte and want its ord() then use the bord() function.
|
||||
# If instead you want it as a single character byte use the bchar() function
|
||||
# both of which are defined below.
|
||||
|
||||
if not PY3:
    # Python 2: the builtin chr/str/ord already operate on byte strings.

    def bchr(s):
        """Return the one-character byte string for integer s."""
        return chr(s)

    def bstr(s):
        """Return s converted with the builtin str()."""
        return str(s)

    def bord(s):
        """Return the integer value of the single byte s."""
        return ord(s)

    def bchar(s):
        """Return s unchanged."""
        return s

else:
    # Also note: decoding a bytestring with 'latin-1' (or any other full
    # range 0-255 encoding) in place of ascii gives a one-to-one mapping
    # between byte values and code points in the 0 - 255 range.

    def bchr(s):
        """Return a length-one bytes object holding integer s."""
        return bytes((s,))

    def bstr(s):
        """Return s as bytes; str input is encoded as latin-1."""
        if isinstance(s, str):
            return s.encode('latin-1')
        return bytes(s)

    def bord(s):
        """Return s unchanged."""
        return s

    def bchar(s):
        """Return a length-one bytes object holding integer s."""
        return bytes((s,))
|
||||
|
||||
if not PY3:
    import __builtin__
    # The Python 2 builtins already return lists, so alias them directly
    # (via __builtin__, because this module rebinds range to xrange).
    lrange = __builtin__.range
    lzip = __builtin__.zip
    lmap = __builtin__.map
    lfilter = __builtin__.filter
else:
    # List-producing versions of the major Python iterating functions.

    def lrange(*args, **kwargs):
        """Return range(*args, **kwargs) materialised as a list."""
        lazy = range(*args, **kwargs)
        return list(lazy)

    def lzip(*args, **kwargs):
        """Return zip(*args, **kwargs) materialised as a list."""
        lazy = zip(*args, **kwargs)
        return list(lazy)

    def lmap(*args, **kwargs):
        """Return map(*args, **kwargs) materialised as a list."""
        lazy = map(*args, **kwargs)
        return list(lazy)

    def lfilter(*args, **kwargs):
        """Return filter(*args, **kwargs) materialised as a list."""
        lazy = filter(*args, **kwargs)
        return list(lazy)
|
||||
|
||||
# In Python 3 you can no longer use .encode('hex') on a bytestring
|
||||
# instead use the following on both platforms
|
||||
import binascii
|
||||
def hexlify(bdata):
    """Return the hexadecimal form of bytestring bdata as a text string."""
    hex_bytes = binascii.hexlify(bdata)
    return hex_bytes.decode('ascii')
|
||||
|
||||
# If you: import struct
|
||||
# Note: struct pack, unpack, unpack_from all *require* bytestring format
|
||||
# data all the way up to at least Python 2.7.5, Python 3 is okay with either
|
||||
|
||||
# If you: import re
|
||||
# note: Python 3 "re" requires the pattern to be the exact same type as the data to be
|
||||
# searched ... but u"" is not allowed for the pattern itself only b""
|
||||
# Python 2.X allows the pattern to be any type and converts it to match the data
|
||||
# and returns the same type as the data
|
||||
|
||||
# convert string to be utf-8 encoded
|
||||
def utf8_str(p, enc='utf-8'):
    """Return p as a utf-8 encoded bytestring.

    None is passed through as None.  Text input is encoded as utf-8.
    Bytestring input is returned untouched when enc is 'utf-8'; otherwise
    it is decoded with enc and re-encoded as utf-8.
    """
    if p is None:
        return None
    if not isinstance(p, text_type):
        if enc == 'utf-8':
            # Already in the target encoding: nothing to do.
            return p
        p = p.decode(enc)
    return p.encode('utf-8')
|
||||
|
||||
# convert string to unicode text (bytestrings are decoded using enc)
|
||||
def unicode_str(p, enc='utf-8'):
    """Return p as text, decoding bytestrings with enc.

    None and values that are already text are returned unchanged.
    """
    if p is None or isinstance(p, text_type):
        return p
    return p.decode(enc)
|
||||
|
||||
# Every character with a code point below 128.
ASCII_CHARS = set(map(chr, range(128)))

# ASCII characters that may appear unquoted in an IRI.
URL_SAFE = set(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
    '0123456789'
    '#'
    '_.-/~'
)

# ASCII characters that must be percent-quoted.
IRI_UNSAFE = ASCII_CHARS.difference(URL_SAFE)
|
||||
|
||||
# returns a quoted IRI (not a URI)
|
||||
def quoteurl(href):
    """Return href as a quoted IRI (not a URI).

    Bytestring input is first decoded as utf-8.  Each character found in
    IRI_UNSAFE is replaced by '%' followed by its two-digit lowercase hex
    code; every other character is copied through unchanged.
    """
    if isinstance(href, binary_type):
        href = href.decode('utf-8')
    return ''.join(
        "%%%02x" % ord(ch) if ch in IRI_UNSAFE else ch
        for ch in href
    )
|
||||
|
||||
# unquotes url/iri
|
||||
def unquoteurl(href):
    """Return href with its percent-escapes decoded by unquote().

    Bytestring input is first decoded as utf-8.
    """
    text = href.decode('utf-8') if isinstance(href, binary_type) else href
    return unquote(text)
|
||||
|
||||
# unescape html
|
||||
def unescapeit(sval):
    """Return sval passed through the unescape() of the module-level _h."""
    unescaped = _h.unescape(sval)
    return unescaped
|
||||
|
||||
# Python 2.X commandline parsing under Windows has been horribly broken for years!
|
||||
# Use the following code to emulate full unicode commandline parsing on Python 2
|
||||
# ie. To get sys.argv arguments and properly encode them as unicode
|
||||
|
||||
def unicode_argv():
    """Return the command-line arguments as a list of unicode strings.

    On Python 3 sys.argv is returned as is.  On Python 2 under Windows the
    arguments are re-read through the Win32 API; None is returned if that
    API reports no arguments.  On Python 2 elsewhere, each bytestring entry
    of sys.argv is decoded with sys.stdin.encoding, falling back to the
    filesystem encoding and finally to 'utf-8'.
    """
    if PY3:
        return sys.argv

    if not iswindows:
        encoding = sys.stdin.encoding
        if encoding is None:
            encoding = sys.getfilesystemencoding()
        if encoding is None:
            encoding = 'utf-8'
        return [
            arg if isinstance(arg, text_type) else arg.decode(encoding)
            for arg in sys.argv
        ]

    # Versions 2.x of Python don't support Unicode in sys.argv on
    # Windows, with the underlying Windows API instead replacing multi-byte
    # characters with '?'.  So use shell32.CommandLineToArgvW to get
    # sys.argv as a list of Unicode strings.
    from ctypes import POINTER, byref, cdll, c_int, windll
    from ctypes.wintypes import LPCWSTR, LPWSTR

    GetCommandLineW = cdll.kernel32.GetCommandLineW
    GetCommandLineW.argtypes = []
    GetCommandLineW.restype = LPCWSTR

    CommandLineToArgvW = windll.shell32.CommandLineToArgvW
    CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
    CommandLineToArgvW.restype = POINTER(LPWSTR)

    command_line = GetCommandLineW()
    arg_count = c_int(0)
    wide_args = CommandLineToArgvW(command_line, byref(arg_count))
    if arg_count.value <= 0:
        # this should never happen
        return None

    # Remove Python executable and commands if present: keep only the
    # trailing len(sys.argv) entries.
    first = arg_count.value - len(sys.argv)
    return [wide_args[i] for i in range(first, arg_count.value)]
|
||||
|
||||
|
||||
# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
|
||||
def add_cp65001_codec():
    """Make 'cp65001' resolve to the utf-8 codec on Python 2.

    Does nothing unless PY2 is true.  If codecs.lookup('cp65001') already
    succeeds, nothing is registered; otherwise a codec search function is
    registered that answers 'cp65001' with the utf-8 codec.  Always
    returns None.
    """
    if not PY2:
        return
    try:
        codecs.lookup('cp65001')
    except LookupError:
        def _cp65001_search(name):
            # A codec search function returns a CodecInfo for names it
            # handles and None for everything else.
            return codecs.lookup('utf-8') if name == 'cp65001' else None

        codecs.register(_cp65001_search)
|
Reference in New Issue
Block a user