Files
ChocolateyPackages/EthanBrown.SublimeText2.UtilPackages/tools/PackageCache/EncodingHelper/EncodingHelper.py
2013-04-04 08:54:47 -04:00

300 lines
10 KiB
Python

# coding=utf8
import sublime, sublime_plugin
import codecs
import sys
import os
# sys.path.append(os.path.join(sublime.packages_path(), 'EncodingHelper', 'chardet'))
from chardet.universaldetector import UniversalDetector
import re
import threading
import time
# Don't parse binary files, just mark these as binary.
# The pattern matches a case-insensitive file extension at the end of the name
# and is assembled from one raw-string fragment per category.
BINARY = re.compile(
    r'\.('
    r'apng|png|jpg|gif|jpeg|bmp|psd|ai|cdr|ico'                        # images
    r'|cache|sublime-package'                                           # sublime
    r'|eot|svgz|ttf|woff'                                               # fonts
    r'|zip|tar|gz|rar|bz2|jar|xpi'                                      # compressed
    r'|mov|mpeg|avi|mpg|flv|wmv|mp3|wav|aif|aiff|snd|wma|asf|asx|pcm'   # audio-video
    r'|pdf|doc|docx|xls|xlsx|ppt|pptx|rtf'                              # docs
    r'|sqlite|sqlitedb|fla|swf|exe'                                     # misc
    r')$',
    re.I)
# Plugin settings object, loaded once at import time and read by the listener,
# the commands and the worker threads in this file.
SETTINGS = sublime.load_settings('EncodingHelper.sublime-settings')
class EncodingOnStatusBarListener(sublime_plugin.EventListener):
    """Show the guessed encoding of a view on the status bar.

    The guess is cached in the view settings under 'encoding_helper_encoding';
    when no cached value exists a GuessEncoding thread is started for the
    view's file.
    """

    def on_load(self, v, ok = True):
        """Refresh the encoding status of view `v`.

        v  -- the view to inspect; a falsy value is ignored.
        ok -- True on the first attempt. When the view still reports the
              encoding 'Undefined' the call is retried once after 120 ms
              with ok=False, after which 'UTF-8' is assumed.
        """
        if not v:
            return
        if v.encoding() == 'Undefined':
            if ok:
                # give time to sublime just one time
                sublime.set_timeout(lambda: self.on_load(v, False), 120)
                return
            v.settings().set('encoding_helper_encoding_sublime', 'UTF-8')

        # if disabled, make sure nothing of ours stays on the status bar
        if not bool(SETTINGS.get('show_encoding_on_status_bar', True)):
            v.erase_status('encoding_helper_statusbar')
            return

        settings = v.settings()
        # mark as loading; on_activated skips views carrying this flag
        settings.set('encoding_helper_loading', True)
        if not settings.has('encoding_helper_encoding_sublime'):
            settings.set('encoding_helper_encoding_sublime', v.encoding())

        # has cached state?
        if settings.has('encoding_helper_encoding'):
            settings.erase('encoding_helper_loading')
            encoding = settings.get('encoding_helper_encoding')
            encoding_sublime = settings.get('encoding_helper_encoding_sublime')
            v.set_status('encoding_helper_statusbar', encoding)
            if (encoding_sublime != '' and encoding_sublime != encoding
                    and encoding not in ('BINARY', 'Unknown')):
                v.set_status('encoding_helper_statusbar_convertion_status', 'Opened as '+encoding_sublime+' (document maybe broken)')
            return

        # is the file there?
        file_name = v.file_name()
        if not file_name or not os.path.isfile(file_name):
            settings.erase('encoding_helper_loading')
            v.set_status('encoding_helper_statusbar', '')
            return

        # guess in a background thread; its on_done clears the loading flag
        v.set_status('encoding_helper_statusbar', '')
        # a missing or null setting must not reach the thread as None
        fallback_encodings = SETTINGS.get('fallback_encodings') or []
        GuessEncoding(file_name, fallback_encodings, v).start()

    def on_activated(self, v):
        """Refresh the status of `v` when it gains focus, unless a refresh is pending."""
        if not bool(SETTINGS.get('show_encoding_on_status_bar', True)):
            v.erase_status('encoding_helper_statusbar')
            return
        if not v.settings().has('encoding_helper_loading') and not v.is_loading():
            self.on_load(v)
def _reload_active_view_status():
    """Re-run the status bar refresh for the active view after a settings change."""
    window = sublime.active_window()
    # no window to refresh; on_load itself already ignores a missing view
    if window is None:
        return
    EncodingOnStatusBarListener().on_load(window.active_view())

SETTINGS.add_on_change('reload', _reload_active_view_status)
class GuessEncoding(threading.Thread):
    """Worker thread that guesses the encoding of a file.

    The result is handed to `callback(encoding, confidence)` through
    sublime.set_timeout. `encoding` is an upper-case codec name, 'BINARY'
    or 'Unknown'.
    """

    def __init__(self, file_name, fallback_encodings = None, v = False, callback = False):
        """
        file_name          -- path of the file to inspect.
        fallback_encodings -- codec names tried in order when detection is
                              not conclusive; None is treated as empty.
        v                  -- view that receives the result in on_done, or False.
        callback           -- callable(encoding, confidence); defaults to on_done.
        """
        threading.Thread.__init__(self)
        self.file_name = file_name
        # ISO-8859-1 is dropped from the configurable fallbacks
        # (presumably because it decodes any byte sequence -- TODO confirm).
        self.fallback_encodings = [
            encoding for encoding in (fallback_encodings or [])
            if str(encoding).lower() not in ('iso88591', 'iso-8859-1')
        ]
        self.v = v
        self.callback = callback if callback else self.on_done

    def run(self):
        """Classify the file and schedule the callback with the result."""
        confidence = 0
        size = os.stat(self.file_name).st_size
        if BINARY.search(self.file_name):
            encoding, confidence = 'BINARY', 1
        elif size > 1048576 and maybe_binary(self.file_name):
            encoding, confidence = 'BINARY', 0.7
        elif size > 1048576: # skip files > 1Mb
            encoding, confidence = 'Unknown', 1
        else:
            encoding, confidence = self._detect()
            encoding = self._resolve(encoding, confidence)
        sublime.set_timeout(lambda: self.callback(encoding, confidence), 0)

    def _detect(self):
        """Feed the file to UniversalDetector and return (encoding, confidence)."""
        started_at = time.time()
        timeout = False
        detector = UniversalDetector()
        # `with` closes the file even when feeding raises
        with open(self.file_name, 'rb') as fp:
            line = fp.readline(500)
            while line:
                detector.feed(line)
                # the detector has made up its mind; more input changes nothing
                if detector.done:
                    break
                # give up feeding after 8 seconds
                if time.time() - started_at > 8:
                    timeout = True
                    break
                line = fp.readline(8000)
        detector.close()
        if not timeout or detector.done:
            # str() turns a missing result (None) into 'NONE'
            return str(detector.result['encoding']).upper(), detector.result['confidence']
        return 'Unknown', 1

    def _resolve(self, encoding, confidence):
        """Apply the fallback and workaround rules to a raw detector result."""
        both = ['UTF-8', 'ISO-8859-1']
        if encoding == 'ASCII':
            encoding = 'UTF-8'
        elif encoding in (None, 'NONE', '', 'Unknown') or confidence < 0.7:
            if encoding == 'ISO-8859-2' and confidence > 0.69:
                encoding = self.test_fallback_encodings(both) or 'Unknown'
            elif encoding != 'ISO-8859-2' and confidence > 0.49:
                if encoding == 'WINDOWS-1252':
                    encoding = 'ISO-8859-1'
            else:
                encoding = self.test_fallback_encodings() or 'Unknown'
        # workarounds here: these two results are replaced when the file
        # decodes cleanly as UTF-8 or ISO-8859-1
        if encoding in ('ISO-8859-2', 'MACCYRILLIC'):
            workaround = self.test_fallback_encodings(both)
            if workaround:
                encoding = workaround
        return encoding

    def test_fallback_encodings(self, encodings = False):
        """Return the first of `encodings` that strictly decodes the whole file.

        encodings -- codec names to try; False means self.fallback_encodings.
        Returns the matching name exactly as given, or False when none fits.
        """
        if encodings is False:
            encodings = self.fallback_encodings
        for encoding in encodings:
            _encoding = translateCodec(encoding.lower())
            try:
                fp = codecs.open(self.file_name, "rb", _encoding, errors='strict')
            except LookupError:
                # codec not available on this system: try the next one
                continue
            try:
                line = fp.readline(500)
                while line:
                    line = fp.readline(8000)
                return encoding
            except UnicodeError:
                # not this encoding; UnicodeError also covers UnicodeDecodeError
                pass
            finally:
                fp.close()
        return False

    def on_done(self, encoding, confidence):
        """Default callback: store the result on the view and update its status bar."""
        if not self.v:
            return
        settings = self.v.settings()
        settings.set('encoding_helper_encoding', encoding)
        settings.set('encoding_helper_confidence', confidence)
        self.v.set_status('encoding_helper_statusbar', encoding)
        if not settings.has('encoding_helper_encoding_sublime'):
            settings.set('encoding_helper_encoding_sublime', self.v.encoding())
        encoding_sublime = settings.get('encoding_helper_encoding_sublime')
        # a null setting must not break the membership test
        auto_convert = SETTINGS.get('open_automatically_as_utf8', []) or []
        if encoding in auto_convert and not self.v.is_dirty():
            ConvertToUTF8(self.file_name, encoding, self.v).start()
        elif (encoding_sublime != '' and encoding_sublime != encoding
                and encoding not in ('BINARY', 'Unknown')):
            self.v.set_status('encoding_helper_statusbar_convertion_status', 'Opened as '+encoding_sublime+' (document maybe broken)')
        settings.erase('encoding_helper_loading')
class Toutf8fromBestGuessCommand(sublime_plugin.WindowCommand):
    """Convert the active view to UTF-8 from the encoding guessed for it."""

    def _best_guess(self):
        """Return the guessed encoding of the active view when a conversion
        makes sense, otherwise None.

        The guess is read from the view setting 'encoding_helper_encoding';
        None, '', 'UTF-8', 'BINARY' and 'Unknown' are not convertible.
        """
        window = sublime.active_window()
        view = window.active_view() if window else None
        if not view:
            return None
        encoding = view.settings().get('encoding_helper_encoding')
        if encoding in (None, '', 'UTF-8', 'BINARY', 'Unknown'):
            return None
        return encoding

    def run(self):
        encoding = self._best_guess()
        if encoding is not None:
            # hand over this command's own window
            Toutf8fromCommand(self.window).run(encoding)

    def description(self):
        encoding = self._best_guess()
        if encoding is None:
            return 'Convert to UTF-8 From Best Guess'
        return 'Convert to UTF-8 From '+encoding

    def is_enabled(self):
        # always a real bool, also when there is nothing to convert
        return self._best_guess() is not None
class Toutf8fromCommand(sublime_plugin.WindowCommand):
    """Convert the file of the active view to UTF-8 from a given encoding."""

    def run(self, encoding = ''):
        """Start a ConvertToUTF8 thread for the active view.

        encoding -- source encoding name; None, '', 'UTF-8', 'BINARY' and
                    'Unknown' are rejected.
        Returns True when the conversion thread was started, False otherwise.
        """
        if encoding in (None, '', 'UTF-8', 'BINARY', 'Unknown'):
            return False
        try:
            v = sublime.active_window().active_view()
            file_name = v.file_name()
            if not file_name or not os.path.isfile(file_name):
                return False
            ConvertToUTF8(file_name, encoding, v).start()
            return True
        except Exception:
            # best effort: a missing window/view or a thread that cannot be
            # started simply means nothing was converted
            return False

    def is_enabled(self, encoding = ''):
        """Enabled only when the active view is backed by an existing file."""
        try:
            file_name = sublime.active_window().active_view().file_name()
            return bool(file_name) and os.path.isfile(file_name)
        except Exception:
            return False
class ConvertToUTF8(threading.Thread):
    """Worker thread that decodes a file with a given encoding and replaces
    the content of a view with the decoded text."""

    def __init__(self, file_name, encoding, v = False, callback = False):
        """
        file_name -- path of the file to decode.
        encoding  -- name of the encoding the file is decoded from.
        v         -- view whose content is replaced in on_done, or False.
        callback  -- callable(content, encoding); defaults to on_done.
        """
        threading.Thread.__init__(self)
        self.file_name = file_name
        self.encoding = encoding
        self.v = v
        self.callback = callback if callback else self.on_done

    def run(self):
        """Decode the whole file strictly and schedule the callback.

        An empty file triggers no callback. Decoding problems and unknown
        codecs are reported through on_error / on_lookup_error.
        """
        _encoding = translateCodec(self.encoding.lower())
        try:
            fp = codecs.open(self.file_name, "rb", _encoding, errors='strict')
            try:
                content = fp.read()
            finally:
                # don't leave the handle to the garbage collector
                fp.close()
            if len(content) != 0:
                sublime.set_timeout(lambda: self.callback(content, self.encoding), 0)
        except UnicodeError as e:
            # UnicodeError also covers UnicodeDecodeError
            print(e)
            sublime.set_timeout(lambda: self.on_error(self.file_name, self.encoding), 0)
        except LookupError as e:
            print(e)
            sublime.set_timeout(lambda: self.on_lookup_error(self.file_name, self.encoding), 0)

    def on_done(self, content, encoding):
        """Default callback: replace the view content and record it as UTF-8."""
        if not self.v:
            return
        edit = self.v.begin_edit()
        try:
            self.v.replace(edit, sublime.Region(0, self.v.size()), content)
        finally:
            # always close the edit, even if replace raises
            self.v.end_edit(edit)
        self.v.settings().set('encoding_helper_encoding_sublime', 'UTF-8')
        self.v.settings().set('encoding_helper_encoding', 'UTF-8')
        # NOTE(review): the nesting of the second set_status was reconstructed
        # from a copy without indentation -- confirm it belongs inside this check.
        if bool(SETTINGS.get('show_encoding_on_status_bar', True)):
            self.v.set_status('encoding_helper_statusbar', 'UTF-8')
            self.v.set_status('encoding_helper_statusbar_convertion_status', 'Converted to UTF-8 from '+encoding)

    def on_error(self, file_name, encoding):
        """Tell the user the file could not be decoded with `encoding`."""
        sublime.error_message('Unable to convert to UTF-8 from encoding "'+encoding+'" the file: \n'+file_name)

    def on_lookup_error(self, file_name, encoding):
        """Tell the user that `encoding` is not a codec known to this Python."""
        sublime.error_message('The encoding "'+encoding+'" is unknown in this system.\n Unable to convert to UTF-8 the file: \n'+file_name)
def maybe_binary(file_name):
    """Return True when a NUL byte is found near the start of `file_name`.

    The file is read in binary mode with readline(): up to 500 bytes first,
    then up to 8000 bytes per call. Every call is charged its full size limit
    against a budget of 1048576; once the budget is exceeded the scan stops
    and False is returned. An empty file yields False.
    """
    read = 500
    # `with` closes the file on every exit path, including exceptions
    with open(file_name, 'rb') as fp:
        line = fp.readline(500)
        while line:
            if b'\0' in line:
                return True
            read += 8000
            if read > 1048576:
                return False
            line = fp.readline(8000)
    return False
# Hook for mapping a codec name to the name codecs.open expects to receive.
# No mapping is applied at the moment.
def translateCodec(encoding):
    """Return `encoding` converted with str(); the name itself is left untouched."""
    codec_name = str(encoding)
    return codec_name