# coding=utf8
import sublime
import sublime_plugin
import codecs
import sys
import os
# sys.path.append(os.path.join(sublime.packages_path(), 'EncodingHelper', 'chardet'))
from chardet.universaldetector import UniversalDetector
import re
import threading
import time

# don't parse binary files, just mark these as binary
#IMAGES-------------------------------------#sublime---------------#fonts-----------#compressed----------------#audio-video--------------------------------------------------#docs------------------------------#misc
BINARY = re.compile(r'\.(apng|png|jpg|gif|jpeg|bmp|psd|ai|cdr|ico|cache|sublime-package|eot|svgz|ttf|woff|zip|tar|gz|rar|bz2|jar|xpi|mov|mpeg|avi|mpg|flv|wmv|mp3|wav|aif|aiff|snd|wma|asf|asx|pcm|pdf|doc|docx|xls|xlsx|ppt|pptx|rtf|sqlite|sqlitedb|fla|swf|exe)$', re.I)

SETTINGS = sublime.load_settings('EncodingHelper.sublime-settings')

# Files larger than this many bytes (1 MiB) are never fed to the detector.
MAX_DETECT_SIZE = 1048576

# Encoding values that can never be used as a conversion source.
_NOT_CONVERTIBLE = (None, '', 'UTF-8', 'BINARY', 'Unknown')


def _is_convertible(encoding):
    """Return True when `encoding` names something we could convert from."""
    return encoding not in _NOT_CONVERTIBLE


def _show_mismatch_status(v, encoding, encoding_sublime):
    """Warn on the status bar when the view's recorded encoding differs from the guess.

    Nothing is shown when the recorded encoding is empty, equals the guess,
    or the guess is 'BINARY' or 'Unknown'.
    """
    if (encoding_sublime != '' and encoding_sublime != encoding
            and encoding != 'BINARY' and encoding != 'Unknown'):
        v.set_status('encoding_helper_statusbar_convertion_status',
                     'Opened as '+encoding_sublime+' (document maybe broken)')


def _active_view_encoding():
    """Return the guessed encoding stored on the active view's settings."""
    return sublime.active_window().active_view().settings().get('encoding_helper_encoding')


class EncodingOnStatusBarListener(sublime_plugin.EventListener):
    """Shows the guessed encoding of a view on the status bar."""

    def on_load(self, v, ok=True):
        """Display the encoding of view `v`, guessing it in a thread if not cached.

        `ok` is True on the first attempt; when the view still reports
        'Undefined' the call is re-scheduled once with `ok` set to False.
        """
        if not v:
            return
        if v.encoding() == 'Undefined' and ok:
            # give time to sublime just one time
            sublime.set_timeout(lambda: self.on_load(v, False), 120)
            return
        elif v.encoding() == 'Undefined' and not ok:
            v.settings().set('encoding_helper_encoding_sublime', 'UTF-8')

        # if enabled, show encoding on status bar
        if not bool(SETTINGS.get('show_encoding_on_status_bar', True)):
            v.erase_status('encoding_helper_statusbar')
            return

        # mark as loading
        v.settings().set('encoding_helper_loading', True)

        if not v.settings().has('encoding_helper_encoding_sublime'):
            v.settings().set('encoding_helper_encoding_sublime', v.encoding())

        # has cached state?
        if v.settings().has('encoding_helper_encoding'):
            v.settings().erase('encoding_helper_loading')
            encoding = v.settings().get('encoding_helper_encoding')
            encoding_sublime = v.settings().get('encoding_helper_encoding_sublime')
            v.set_status('encoding_helper_statusbar', encoding)
            _show_mismatch_status(v, encoding, encoding_sublime)
            return

        # is the file there?
        file_name = v.file_name()
        if not file_name or not os.path.isfile(file_name):
            v.settings().erase('encoding_helper_loading')
            v.set_status('encoding_helper_statusbar', '')
        else:
            # guess; 'encoding_helper_loading' is erased by GuessEncoding.on_done
            v.set_status('encoding_helper_statusbar', '')
            GuessEncoding(file_name, SETTINGS.get('fallback_encodings'), v).start()

    def on_activated(self, v):
        """Refresh the status bar for `v` unless a guess is already in flight."""
        if bool(SETTINGS.get('show_encoding_on_status_bar', True)):
            if not v.settings().has('encoding_helper_loading') and not v.is_loading():
                self.on_load(v)
        else:
            v.erase_status('encoding_helper_statusbar')


SETTINGS.add_on_change('reload', lambda: EncodingOnStatusBarListener().on_load(sublime.active_window().active_view()))


class GuessEncoding(threading.Thread):
    """Background thread that guesses the encoding of a file on disk."""

    def __init__(self, file_name, fallback_encodings=None, v=False, callback=False):
        """Prepare a guess for `file_name`.

        fallback_encodings -- codec names tried when detection is unreliable;
                              None is treated as an empty list. Spellings of
                              ISO-8859-1 are filtered out.
        v                  -- view that receives the result (optional).
        callback           -- called as callback(encoding, confidence);
                              defaults to self.on_done.
        """
        threading.Thread.__init__(self)
        self.file_name = file_name
        skipped = ('ISO88591', 'iso88591', 'iso-8859-1', 'ISO-8859-1')
        # `or []` guards against the setting being absent (None).
        self.fallback_encodings = [
            name for name in (fallback_encodings or []) if name not in skipped
        ]
        self.v = v
        if callback == False:
            self.callback = self.on_done
        else:
            self.callback = callback

    def run(self):
        """Guess the encoding and hand (encoding, confidence) to the callback.

        The callback is scheduled through sublime.set_timeout rather than
        being called directly from this thread.
        """
        confidence = 0
        size = os.stat(self.file_name).st_size
        if BINARY.search(self.file_name):
            encoding = 'BINARY'
            confidence = 1
        elif size > MAX_DETECT_SIZE and maybe_binary(self.file_name):
            encoding = 'BINARY'
            confidence = 0.7
        elif size > MAX_DETECT_SIZE:  # skip files > 1Mb
            encoding = 'Unknown'
            confidence = 1
        else:
            started_at = time.time()
            timeout = False

            detector = UniversalDetector()
            fp = open(self.file_name, 'rb')
            try:
                line = fp.readline(500)
                # Truthiness test ends the loop for both '' and b''.
                while line:
                    detector.feed(line)
                    if time.time() - started_at > 8:
                        timeout = True
                        break
                    line = fp.readline(8000)
            finally:
                fp.close()
            detector.close()

            if not timeout or detector.done:
                encoding = str(detector.result['encoding']).upper()
                confidence = detector.result['confidence']
            else:
                encoding = 'Unknown'
                confidence = 1

            if encoding == 'ASCII':
                encoding = 'UTF-8'
            elif not encoding or encoding in ('NONE', 'Unknown') or confidence < 0.7:
                if encoding == 'ISO-8859-2' and confidence > 0.69:
                    workaround = self.test_fallback_encodings(['UTF-8', 'ISO-8859-1'])
                    if workaround != False:
                        encoding = workaround
                    else:
                        encoding = 'Unknown'
                elif encoding != 'ISO-8859-2' and confidence > 0.49:
                    if encoding == 'WINDOWS-1252':
                        encoding = 'ISO-8859-1'
                else:
                    # NOTE(review): the flattened source does not show which
                    # `if` this `else` pairs with; it is attached to the
                    # confidence chain so that low-confidence guesses go
                    # through the fallback list -- confirm against upstream.
                    fallback = self.test_fallback_encodings()
                    if fallback == False:
                        encoding = 'Unknown'
                    else:
                        encoding = fallback

            # workarounds here
            if encoding == 'ISO-8859-2' or encoding == 'MACCYRILLIC':
                workaround = self.test_fallback_encodings(['UTF-8', 'ISO-8859-1'])
                if workaround != False:
                    encoding = workaround

        sublime.set_timeout(lambda: self.callback(encoding, confidence), 0)

    def test_fallback_encodings(self, encodings=False):
        """Return the first codec that decodes the whole file strictly, else False.

        encodings -- codec names to try; False means self.fallback_encodings.
        Names that the codec registry does not know are skipped.
        """
        if encodings is False:
            encodings = self.fallback_encodings
        for encoding in encodings:
            codec_name = translateCodec(encoding.lower())
            try:
                fp = codecs.open(self.file_name, "rb", codec_name, errors='strict')
            except LookupError:
                # Unknown codec name: try the next candidate instead of
                # letting the exception end the thread.
                continue
            try:
                line = fp.readline(500)
                while line:
                    line = fp.readline(8000)
                return encoding
            except UnicodeDecodeError:
                pass
            finally:
                fp.close()
        return False

    def on_done(self, encoding, confidence):
        """Cache the guess on the view and update the status bar."""
        if not self.v:
            return
        view_settings = self.v.settings()
        view_settings.set('encoding_helper_encoding', encoding)
        view_settings.set('encoding_helper_confidence', confidence)
        self.v.set_status('encoding_helper_statusbar', encoding)

        if not view_settings.has('encoding_helper_encoding_sublime'):
            view_settings.set('encoding_helper_encoding_sublime', self.v.encoding())
        encoding_sublime = view_settings.get('encoding_helper_encoding_sublime')

        # `or []` guards against the setting being explicitly null.
        auto_convert = SETTINGS.get('open_automatically_as_utf8', []) or []
        if encoding in auto_convert and not self.v.is_dirty():
            ConvertToUTF8(self.file_name, encoding, self.v).start()
        else:
            _show_mismatch_status(self.v, encoding, encoding_sublime)
        view_settings.erase('encoding_helper_loading')


class Toutf8fromBestGuessCommand(sublime_plugin.WindowCommand):
    """Convert the active view to UTF-8 from its guessed encoding."""

    def run(self):
        """Delegate to Toutf8fromCommand when the guess is convertible."""
        encoding = _active_view_encoding()
        if _is_convertible(encoding):
            Toutf8fromCommand(self.window).run(encoding)

    def description(self):
        """Return the caption, naming the guessed encoding when it is usable."""
        try:
            encoding = _active_view_encoding()
            if _is_convertible(encoding):
                return 'Convert to UTF-8 From '+encoding
        except Exception:
            # Best effort: fall through to the generic caption.
            pass
        return 'Convert to UTF-8 From Best Guess'

    def is_enabled(self):
        """Return True only when the active view has a convertible guess."""
        try:
            return _is_convertible(_active_view_encoding())
        except Exception:
            return False


class Toutf8fromCommand(sublime_plugin.WindowCommand):
    """Convert the active view to UTF-8 from an explicitly named encoding."""

    def run(self, encoding=''):
        """Start the conversion; return True if it was started, else False."""
        try:
            if not _is_convertible(encoding):
                return False
            v = sublime.active_window().active_view()
            file_name = v.file_name()
            if not file_name or not os.path.isfile(file_name):
                return False
            ConvertToUTF8(file_name, encoding, v).start()
            return True
        except Exception:
            return False

    def is_enabled(self, encoding=''):
        """Return True when the active view is backed by an existing file."""
        try:
            file_name = sublime.active_window().active_view().file_name()
            return bool(file_name) and os.path.isfile(file_name)
        except Exception:
            return False


class ConvertToUTF8(threading.Thread):
    """Background thread that re-reads a file with a given codec."""

    def __init__(self, file_name, encoding, v=False, callback=False):
        """Prepare the conversion of `file_name` from `encoding`.

        v        -- view whose buffer is replaced by on_done (optional).
        callback -- called as callback(content, encoding); defaults to
                    self.on_done.
        """
        threading.Thread.__init__(self)
        self.file_name = file_name
        self.encoding = encoding
        self.v = v
        if callback == False:
            self.callback = self.on_done
        else:
            self.callback = callback

    def run(self):
        """Decode the file strictly and schedule the callback or an error dialog."""
        codec_name = translateCodec(self.encoding.lower())
        try:
            fp = codecs.open(self.file_name, "rb", codec_name, errors='strict')
            try:
                content = fp.read()
            finally:
                fp.close()
            # Empty files are left untouched.
            if len(content) != 0:
                sublime.set_timeout(lambda: self.callback(content, self.encoding), 0)
        except UnicodeDecodeError as e:
            print(e)
            sublime.set_timeout(lambda: self.on_error(self.file_name, self.encoding), 0)
        except LookupError as e:
            print(e)
            sublime.set_timeout(lambda: self.on_lookup_error(self.file_name, self.encoding), 0)

    def on_done(self, content, encoding):
        """Replace the view's buffer with `content` and record it as UTF-8."""
        if self.v:
            edit = self.v.begin_edit()
            self.v.replace(edit, sublime.Region(0, self.v.size()), content)
            self.v.end_edit(edit)
            self.v.settings().set('encoding_helper_encoding_sublime', 'UTF-8')
            self.v.settings().set('encoding_helper_encoding', 'UTF-8')
            if bool(SETTINGS.get('show_encoding_on_status_bar', True)):
                self.v.set_status('encoding_helper_statusbar', 'UTF-8')
            self.v.set_status('encoding_helper_statusbar_convertion_status', 'Converted to UTF-8 from '+encoding)

    def on_error(self, file_name, encoding):
        """Tell the user the file could not be decoded with `encoding`."""
        sublime.error_message('Unable to convert to UTF-8 from encoding "'+encoding+'" the file: \n'+file_name)

    def on_lookup_error(self, file_name, encoding):
        """Tell the user that `encoding` is not a codec known to this Python."""
        sublime.error_message('The encoding "'+encoding+'" is unknown in this system.\n Unable to convert to UTF-8 the file: \n'+file_name)


def maybe_binary(file_name):
    """Return True if a NUL byte is found while scanning the start of the file.

    The scan stops, returning False, once the running byte budget passes
    MAX_DETECT_SIZE or the file ends.
    """
    with open(file_name, 'rb') as fp:
        line = fp.readline(500)
        read = 500
        while line:
            if b'\0' in line:
                return True
            # Budget counts the requested chunk size, not the bytes returned.
            read += 8000
            if read > MAX_DETECT_SIZE:
                return False
            line = fp.readline(8000)
    return False


# should map different codecs to what codecs.open expects to receive
def translateCodec(encoding):
    """Return the codec name to pass to codecs.open (currently just str())."""
    return str(encoding)