300 lines
10 KiB
Python
300 lines
10 KiB
Python
# coding=utf8
|
|
import sublime, sublime_plugin
|
|
import codecs
|
|
import sys
|
|
import os
|
|
# sys.path.append(os.path.join(sublime.packages_path(), 'EncodingHelper', 'chardet'))
|
|
from chardet.universaldetector import UniversalDetector
|
|
import re
|
|
import threading
|
|
import time
|
|
|
|
# Files whose extension matches this pattern are not parsed; they are simply
# reported as 'BINARY'. The match is case-insensitive and anchored at the end
# of the file name.
BINARY = re.compile(
    r'\.('
    # images
    r'apng|png|jpg|gif|jpeg|bmp|psd|ai|cdr|ico|'
    # sublime
    r'cache|sublime-package|'
    # fonts
    r'eot|svgz|ttf|woff|'
    # compressed
    r'zip|tar|gz|rar|bz2|jar|xpi|'
    # audio-video
    r'mov|mpeg|avi|mpg|flv|wmv|mp3|wav|aif|aiff|snd|wma|asf|asx|pcm|'
    # docs
    r'pdf|doc|docx|xls|xlsx|ppt|pptx|rtf|'
    # misc
    r'sqlite|sqlitedb|fla|swf|exe'
    r')$',
    re.I
)
|
|
|
|
# Plugin-wide settings object, loaded once at import time from
# 'EncodingHelper.sublime-settings'.
SETTINGS = sublime.load_settings('EncodingHelper.sublime-settings')
|
|
|
|
class EncodingOnStatusBarListener(sublime_plugin.EventListener):
    """Event listener that keeps the encoding indicator of a view up to date.

    The guessed encoding is cached in the view settings under
    'encoding_helper_encoding'; while a guess is pending the view carries the
    'encoding_helper_loading' flag.
    """

    def on_load(self, v, ok = True):
        """Show the cached encoding of `v`, or start a background guess.

        `ok` is True on the first attempt. When the view still reports the
        encoding 'Undefined', one retry is scheduled 120 ms later with
        `ok` set to False; on that retry the encoding seen by Sublime is
        recorded as 'UTF-8'.
        """
        if not v:
            return

        if v.encoding() == 'Undefined':
            if ok:
                # give Sublime a little time, but only once
                sublime.set_timeout(lambda: self.on_load(v, False), 120)
                return
            v.settings().set('encoding_helper_encoding_sublime', 'UTF-8')

        # indicator disabled: make sure nothing is left on the status bar
        if not bool(SETTINGS.get('show_encoding_on_status_bar', True)):
            v.erase_status('encoding_helper_statusbar')
            return

        view_settings = v.settings()

        # mark as loading
        view_settings.set('encoding_helper_loading', True)

        if not view_settings.has('encoding_helper_encoding_sublime'):
            view_settings.set('encoding_helper_encoding_sublime', v.encoding())

        # cached state: display it and stop
        if view_settings.has('encoding_helper_encoding'):
            view_settings.erase('encoding_helper_loading')
            encoding = view_settings.get('encoding_helper_encoding')
            encoding_sublime = view_settings.get('encoding_helper_encoding_sublime')
            v.set_status('encoding_helper_statusbar', encoding)
            differs = encoding_sublime != '' and encoding_sublime != encoding
            if differs and encoding not in ('BINARY', 'Unknown'):
                v.set_status(
                    'encoding_helper_statusbar_convertion_status',
                    'Opened as '+encoding_sublime+' (document maybe broken)'
                )
            return

        # nothing cached: a guess is only possible for a file that exists
        file_name = v.file_name()
        if file_name and os.path.isfile(file_name):
            v.set_status('encoding_helper_statusbar', '')
            GuessEncoding(file_name, SETTINGS.get('fallback_encodings'), v).start()
        else:
            view_settings.erase('encoding_helper_loading')
            v.set_status('encoding_helper_statusbar', '')

    def on_activated(self, v):
        """Refresh the indicator when `v` gains focus.

        Nothing happens while a guess is already pending for the view or
        while the view is still loading.
        """
        if not bool(SETTINGS.get('show_encoding_on_status_bar', True)):
            v.erase_status('encoding_helper_statusbar')
            return
        if v.settings().has('encoding_helper_loading'):
            return
        if not v.is_loading():
            self.on_load(v)
|
|
|
|
# Registers a callback under the key 'reload' that runs on_load for the active
# view of the active window, using a fresh listener instance.
# NOTE(review): assumes sublime.active_window() is not None when the callback
# fires (on_load itself tolerates a falsy view) — confirm.
SETTINGS.add_on_change('reload', lambda: EncodingOnStatusBarListener().on_load(sublime.active_window().active_view()))
|
|
|
|
class GuessEncoding(threading.Thread):
    """Worker thread that guesses the encoding of a file.

    The guess is delivered through `callback(encoding, confidence)`, which
    is scheduled with `sublime.set_timeout`. `encoding` is an upper-case
    encoding name, 'BINARY' or 'Unknown'.
    """

    def __init__(self, file_name, fallback_encodings = None, v = False, callback = False):
        """Prepare the worker.

        file_name          -- path of the file to inspect.
        fallback_encodings -- encodings to try when the detector is not
                              confident; None (e.g. a missing setting) is
                              treated as an empty list.
        v                  -- view that receives the result in `on_done`.
        callback           -- callable(encoding, confidence); defaults to
                              `self.on_done`.
        """
        threading.Thread.__init__(self)
        self.file_name = file_name

        # The listed ISO-8859-1 spellings are never used as fallbacks.
        # NOTE(review): presumably because a single-byte codec decodes any
        # input and would therefore always "match" — confirm.
        skipped = ('ISO88591', 'iso88591', 'iso-8859-1', 'ISO-8859-1')
        self.fallback_encodings = [
            encoding for encoding in (fallback_encodings or [])
            if encoding not in skipped
        ]

        self.v = v
        self.callback = callback or self.on_done

    def run(self):
        """Guess the encoding and schedule the callback with the result."""
        try:
            encoding, confidence = self._guess()
        except (IOError, OSError):
            # The file vanished or cannot be read. Still report a result so
            # that the view does not stay flagged as loading forever.
            encoding, confidence = 'Unknown', 1
        sublime.set_timeout(lambda: self.callback(encoding, confidence), 0)

    def _guess(self):
        """Return (encoding, confidence) for `self.file_name`."""
        size = os.stat(self.file_name).st_size
        if BINARY.search(self.file_name):
            return 'BINARY', 1
        if size > 1048576:
            if maybe_binary(self.file_name):
                return 'BINARY', 0.7
            # skip files > 1Mb
            return 'Unknown', 1

        encoding, confidence = self._feed_detector()
        return self._apply_fallbacks(encoding, confidence), confidence

    def _feed_detector(self):
        """Run the detector over the file, giving up after 8 seconds."""
        started_at = time.time()
        timeout = False

        detector = UniversalDetector()
        with open(self.file_name, 'rb') as fp:
            line = fp.readline(500)
            # Truthiness ends the loop for both str and bytes reads.
            while line:
                detector.feed(line)
                if time.time() - started_at > 8:
                    timeout = True
                    break
                line = fp.readline(8000)
        detector.close()

        # A timed-out run is only trusted when the detector was already done.
        if not timeout or detector.done:
            return str(detector.result['encoding']).upper(), detector.result['confidence']
        return 'Unknown', 1

    def _apply_fallbacks(self, encoding, confidence):
        """Adjust the detector's answer with the fallback and workaround rules."""
        if encoding == 'ASCII':
            encoding = 'UTF-8'
        elif encoding in ('NONE', '', 'Unknown') or confidence < 0.7:
            if encoding == 'ISO-8859-2' and confidence > 0.69:
                workaround = self.test_fallback_encodings(['UTF-8', 'ISO-8859-1'])
                encoding = workaround if workaround is not False else 'Unknown'
            elif encoding != 'ISO-8859-2' and confidence > 0.49:
                if encoding == 'WINDOWS-1252':
                    encoding = 'ISO-8859-1'
            else:
                fallback = self.test_fallback_encodings()
                encoding = fallback if fallback is not False else 'Unknown'

        # workarounds here
        if encoding in ('ISO-8859-2', 'MACCYRILLIC'):
            workaround = self.test_fallback_encodings(['UTF-8', 'ISO-8859-1'])
            if workaround is not False:
                encoding = workaround
        return encoding

    def test_fallback_encodings(self, encodings = False):
        """Return the first encoding that decodes the whole file, else False.

        `encodings` defaults to `self.fallback_encodings`. Encodings that
        fail to decode the file, or that are unknown to this Python, are
        skipped.
        """
        if encodings is False:
            encodings = self.fallback_encodings
        for encoding in encodings:
            _encoding = translateCodec(encoding.lower())
            try:
                with codecs.open(self.file_name, "rb", _encoding, errors='strict') as fp:
                    line = fp.readline(500)
                    while line:
                        line = fp.readline(8000)
                return encoding
            except (UnicodeDecodeError, LookupError):
                # strict decoding failed, or the codec does not exist: next one
                continue
        return False

    def on_done(self, encoding, confidence):
        """Default callback: store the guess on the view and show it."""
        if self.v:
            view_settings = self.v.settings()
            view_settings.set('encoding_helper_encoding', encoding)
            view_settings.set('encoding_helper_confidence', confidence)
            self.v.set_status('encoding_helper_statusbar', encoding)

            if not view_settings.has('encoding_helper_encoding_sublime'):
                view_settings.set('encoding_helper_encoding_sublime', self.v.encoding())
            encoding_sublime = view_settings.get('encoding_helper_encoding_sublime')

            auto_convert = SETTINGS.get('open_automatically_as_utf8', []) or []
            if encoding in auto_convert and not self.v.is_dirty():
                ConvertToUTF8(self.file_name, encoding, self.v).start()
            elif encoding_sublime != '' and encoding_sublime != encoding and encoding not in ('BINARY', 'Unknown'):
                self.v.set_status(
                    'encoding_helper_statusbar_convertion_status',
                    'Opened as '+encoding_sublime+' (document maybe broken)'
                )
            view_settings.erase('encoding_helper_loading')
|
|
|
|
class Toutf8fromBestGuessCommand(sublime_plugin.WindowCommand):
    """Window command: convert the active view to UTF-8 from its guessed encoding."""

    def _guessed_encoding(self):
        """Return the guessed encoding of the active view if it is convertible.

        Returns None when no guess is stored or when the guess is 'UTF-8',
        'BINARY', 'Unknown' or empty. Raises AttributeError when there is no
        active window or view.
        """
        encoding = sublime.active_window().active_view().settings().get('encoding_helper_encoding')
        if encoding is None or encoding in ('UTF-8', 'BINARY', 'Unknown', ''):
            return None
        return encoding

    def run(self):
        """Start the conversion when a convertible guess is available."""
        encoding = self._guessed_encoding()
        if encoding is not None:
            # Hand over this command's own window instead of the
            # WindowCommand class object.
            Toutf8fromCommand(self.window).run(encoding)

    def description(self):
        """Return the menu caption, naming the guessed encoding when known."""
        try:
            encoding = self._guessed_encoding()
            if encoding is not None:
                return 'Convert to UTF-8 From '+encoding
        except Exception:
            # best effort: fall through to the generic caption
            pass
        return 'Convert to UTF-8 From Best Guess'

    def is_enabled(self):
        """Return True only when the active view has a convertible guess."""
        try:
            return self._guessed_encoding() is not None
        except Exception:
            return False
|
|
|
|
class Toutf8fromCommand(sublime_plugin.WindowCommand):
    """Window command: convert the active view to UTF-8 from a given encoding."""

    def run(self, encoding = ''):
        """Start converting the active view's file from `encoding`.

        Returns True when a ConvertToUTF8 worker was started, False when
        `encoding` is not convertible (None, 'UTF-8', 'BINARY', 'Unknown'
        or empty), when the view has no existing file on disk, or when any
        error occurs.
        """
        try:
            if encoding is None or encoding in ('UTF-8', 'BINARY', 'Unknown', ''):
                return False
            v = sublime.active_window().active_view()
            file_name = v.file_name()
            if not file_name or not os.path.isfile(file_name):
                return False
            ConvertToUTF8(file_name, encoding, v).start()
            return True
        except Exception:
            # Best effort, but let SystemExit / KeyboardInterrupt propagate.
            return False

    def is_enabled(self, encoding = ''):
        """Return True when the active view is backed by an existing file."""
        try:
            file_name = sublime.active_window().active_view().file_name()
            return bool(file_name) and os.path.isfile(file_name)
        except Exception:
            return False
|
|
|
|
class ConvertToUTF8(threading.Thread):
    """Worker thread that decodes a file with a given encoding.

    On success the decoded text is handed to `callback(content, encoding)`
    via `sublime.set_timeout`; on failure an error dialog is scheduled the
    same way.
    """

    def __init__(self, file_name, encoding, v = False, callback = False):
        """Prepare the worker.

        file_name -- path of the file to decode.
        encoding  -- name of the encoding the file is read with.
        v         -- view whose content is replaced in `on_done`.
        callback  -- callable(content, encoding); defaults to `self.on_done`.
        """
        threading.Thread.__init__(self)
        self.file_name = file_name

        self.encoding = encoding
        self.v = v
        self.callback = callback or self.on_done

    def run(self):
        """Decode the file strictly and schedule the matching handler."""
        _encoding = translateCodec(self.encoding.lower())
        try:
            # `with` closes the handle even when decoding fails.
            with codecs.open(self.file_name, "rb", _encoding, errors='strict') as fp:
                content = fp.read()
        except UnicodeDecodeError as e:
            print(e)
            sublime.set_timeout(lambda: self.on_error(self.file_name, self.encoding), 0)
            return
        except LookupError as e:
            # the codec name is not known to this Python
            print(e)
            sublime.set_timeout(lambda: self.on_lookup_error(self.file_name, self.encoding), 0)
            return

        # an empty file needs no conversion
        if len(content) != 0:
            sublime.set_timeout(lambda: self.callback(content, self.encoding), 0)

    def on_done(self, content, encoding):
        """Default callback: replace the view's text and record it as UTF-8."""
        if self.v:
            edit = self.v.begin_edit()
            self.v.replace(edit, sublime.Region(0, self.v.size()), content)
            self.v.end_edit(edit)
            self.v.settings().set('encoding_helper_encoding_sublime', 'UTF-8')
            self.v.settings().set('encoding_helper_encoding', 'UTF-8')
            if bool(SETTINGS.get('show_encoding_on_status_bar', True)):
                self.v.set_status('encoding_helper_statusbar', 'UTF-8')
                # NOTE(review): the original indentation was lost; this
                # status is assumed to be gated by the same setting — confirm.
                self.v.set_status('encoding_helper_statusbar_convertion_status', 'Converted to UTF-8 from '+encoding)

    def on_error(self, file_name, encoding):
        """Tell the user that the file could not be decoded with `encoding`."""
        sublime.error_message('Unable to convert to UTF-8 from encoding "'+encoding+'" the file: \n'+file_name)

    def on_lookup_error(self, file_name, encoding):
        """Tell the user that `encoding` is not available on this system."""
        sublime.error_message('The encoding "'+encoding+'" is unknown in this system.\n Unable to convert to UTF-8 the file: \n'+file_name)
|
|
|
|
def maybe_binary(file_name):
    """Return True when a NUL byte occurs in the first 1 MiB of the file.

    The file is read in fixed-size binary chunks and the scan stops at the
    first NUL byte, at end of file, or once 1048576 bytes have been
    examined. The number of bytes actually read is counted, so files with
    many short lines are scanned as far as files with long ones.
    """
    limit = 1048576
    scanned = 0
    with open(file_name, 'rb') as fp:
        while scanned < limit:
            chunk = fp.read(min(8192, limit - scanned))
            if not chunk:
                break
            if b'\0' in chunk:
                return True
            scanned += len(chunk)
    return False
|
|
|
|
# Hook meant to map codec names to the spelling that codecs.open expects to
# receive; for now no mapping is applied.
def translateCodec(encoding):
    """Return `encoding` converted to a plain `str`."""
    codec_name = str(encoding)
    return codec_name
|