encoding.py 2.8 KB
Newer Older
Stelios Karozis's avatar
Stelios Karozis committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
# coding: utf-8
"""
Utilities for dealing with text encodings
"""

#-----------------------------------------------------------------------------
#  Copyright (C) 2008-2012  The IPython Development Team
#
#  Distributed under the terms of the BSD License.  The full license is in
#  the file COPYING, distributed as part of this software.
#-----------------------------------------------------------------------------

#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------
import sys
import locale
import warnings

# to deal with the possibility of sys.std* not being a stream at all
def get_stream_enc(stream, default=None):
    """Return the given stream's encoding or a default.

    There are cases where ``sys.std*`` might not actually be a stream, so
    check for the encoding attribute prior to returning it, and return
    a default if it doesn't exist or evaluates as False. ``default``
    is None if not provided.
    """
    if not hasattr(stream, 'encoding') or not stream.encoding:
        return default
    else:
        return stream.encoding

# Less conservative replacement for sys.getdefaultencoding, that will try
# to match the environment.
# Defined here as central function, so if we find better choices, we
# won't need to make changes all over IPython.
def getdefaultencoding(prefer_stream=True):
    """Return IPython's guess for the default encoding for bytes as text.
    
    If prefer_stream is True (default), asks for stdin.encoding first,
    to match the calling Terminal, but that is often None for subprocesses.
    
    Then fall back on locale.getpreferredencoding(),
    which should be a sensible platform default (that respects LANG environment),
    and finally to sys.getdefaultencoding() which is the most conservative option,
    and usually ASCII on Python 2 or UTF8 on Python 3.
    """
    enc = None
    if prefer_stream:
        enc = get_stream_enc(sys.stdin)
    if not enc or enc=='ascii':
        try:
            # There are reports of getpreferredencoding raising errors
            # in some cases, which may well be fixed, but let's be conservative here.
            enc = locale.getpreferredencoding()
        except Exception:
            pass
    enc = enc or sys.getdefaultencoding()
    # On windows `cp0` can be returned to indicate that there is no code page.
    # Since cp0 is an invalid encoding return instead cp1252 which is the
    # Western European default.
    if enc == 'cp0':
        warnings.warn(
            "Invalid code page cp0 detected - using cp1252 instead."
            "If cp1252 is incorrect please ensure a valid code page "
            "is defined for the process.", RuntimeWarning)
        return 'cp1252'
    return enc

DEFAULT_ENCODING = getdefaultencoding()