86>Aug 1 09:48:00 userdel[2249300]: delete user 'rooter' <86>Aug 1 09:48:00 userdel[2249300]: removed group 'rooter' owned by 'rooter' <86>Aug 1 09:48:00 userdel[2249300]: removed shadow group 'rooter' owned by 'rooter' <86>Aug 1 09:48:00 groupadd[2249308]: group added to /etc/group: name=rooter, GID=639 <86>Aug 1 09:48:00 groupadd[2249308]: group added to /etc/gshadow: name=rooter <86>Aug 1 09:48:00 groupadd[2249308]: new group: name=rooter, GID=639 <86>Aug 1 09:48:00 useradd[2249313]: new user: name=rooter, UID=639, GID=639, home=/root, shell=/bin/bash <86>Aug 1 09:48:00 userdel[2249328]: delete user 'builder' <86>Aug 1 09:48:00 userdel[2249328]: removed group 'builder' owned by 'builder' <86>Aug 1 09:48:00 userdel[2249328]: removed shadow group 'builder' owned by 'builder' <86>Aug 1 09:48:00 groupadd[2249340]: group added to /etc/group: name=builder, GID=640 <86>Aug 1 09:48:00 groupadd[2249340]: group added to /etc/gshadow: name=builder <86>Aug 1 09:48:00 groupadd[2249340]: new group: name=builder, GID=640 <86>Aug 1 09:48:00 useradd[2249347]: new user: name=builder, UID=640, GID=640, home=/usr/src, shell=/bin/bash <13>Aug 1 09:48:04 rpmi: rpm-build-compat-2.2.6-alt2 p9+263465.40.2.1 1608016546 installed <13>Aug 1 09:48:04 rpmi: libgdbm-1.8.3-alt10 1454943334 installed <13>Aug 1 09:48:04 rpmi: libexpat-2.2.10-alt1 p9+261554.100.1.1 1605103337 installed <13>Aug 1 09:48:04 rpmi: libp11-kit-0.23.15-alt2 p9+254920.2400.19.1 1601385903 installed <13>Aug 1 09:48:04 rpmi: libtasn1-4.14-alt1 p9+235792.100.2.1 1565425233 installed <13>Aug 1 09:48:04 rpmi: rpm-macros-alternatives-0.5.1-alt1 sisyphus+226946.100.1.1 1554830426 installed <13>Aug 1 09:48:04 rpmi: alternatives-0.5.1-alt1 sisyphus+226946.100.1.1 1554830426 installed <13>Aug 1 09:48:04 rpmi: ca-certificates-2020.06.29-alt1 p9+258899.100.3.1 1601998604 installed <13>Aug 1 09:48:04 rpmi: ca-trust-0.1.2-alt1 p9+233349.100.1.1 1561655062 installed <13>Aug 1 09:48:04 rpmi: p11-kit-trust-0.23.15-alt2 p9+254920.2400.19.1 1601385903 installed <13>Aug 1 09:48:04 rpmi: libcrypto1.1-1.1.1k-alt1 p9+268376.100.3.1 1616721011 installed <13>Aug 1 09:48:04 rpmi: libssl1.1-1.1.1k-alt1 p9+268376.100.3.1 1616721011 installed <13>Aug 1 09:48:04 rpmi: python3-3.7.4-alt3 p9+249932.100.2.1 1587127349 installed <13>Aug 1 09:48:04 rpmi: libpython3-3.7.4-alt3 p9+249932.100.2.1 1587127349 installed <13>Aug 1 09:48:05 rpmi: python3-base-3.7.4-alt3 p9+249932.100.2.1 1587127349 installed <13>Aug 1 09:48:05 rpmi: tests-for-installed-python3-pkgs-0.1.13.1-alt2 1535450458 installed <13>Aug 1 09:48:05 rpmi: rpm-build-python3-0.1.13.1-alt2 1535450458 installed <13>Aug 1 09:48:05 rpmi: rpm-build-intro-2.2.6-alt2 p9+263465.40.2.1 1608016546 installed <13>Aug 1 09:48:09 rpmi: python3-module-six-1.12.0-alt1 sisyphus+219665.100.2.1 1548148570 installed <13>Aug 1 09:48:09 rpmi: python3-module-webencodings-0.5.1-alt1.1 1517943573 installed <13>Aug 1 09:48:09 rpmi: python3-module-genshi-0.7-alt2 sisyphus+229363.100.1.1 1557847335 installed <13>Aug 1 09:48:09 rpmi: python3-module-iniconfig-1.0.0-alt1 sisyphus+227494.1100.1.3 1555839106 installed <13>Aug 1 09:48:09 rpmi: python3-module-pluggy-0.11.0-alt1 sisyphus+229116.100.1.1 1557316655 installed <13>Aug 1 09:48:09 rpmi: python3-module-pkg_resources-1:40.8.0-alt2 sisyphus+229158.200.2.1 1557735221 installed <13>Aug 1 09:48:09 rpmi: python3-module-more-itertools-7.0.0-alt3 sisyphus+226536.100.1.1 1554329928 installed <13>Aug 1 09:48:09 rpmi: python3-module-attrs-19.3.0-alt1 p9+266900.5140.22.1 1618153287 installed <13>Aug 1 09:48:09 rpmi: python3-module-atomicwrites-1.3.0-alt1 sisyphus+227473.2000.1.3 1555763467 installed <13>Aug 1 09:48:09 rpmi: python3-module-apipkg-1.5-alt1 sisyphus+227465.1100.1.2 1555756555 installed <13>Aug 1 09:48:09 rpmi: python3-module-py-1.8.0-alt2 sisyphus+228349.1400.4.1 1556631070 installed <13>Aug 1 09:48:09 rpmi: python3-module-pytest-3.10.1-alt5 sisyphus+228366.2000.2.3 1556707871 installed <13>Aug 1 09:48:09 rpmi: libverto-0.3.0-alt1_7 sisyphus+225932.100.1.1 1553994919 installed <13>Aug 1 09:48:09 rpmi: libkeyutils-1.6-alt2 sisyphus+226520.100.2.1 1554512089 installed <13>Aug 1 09:48:09 rpmi: libgpg-error-1.36-alt1 p9+261942.140.4.1 1606237675 installed <13>Aug 1 09:48:09 rpmi: libgcrypt20-1.8.5-alt3 p9+261942.200.4.1 1606237923 installed <13>Aug 1 09:48:09 rpmi: libxslt-1.1.34-alt1.p9.1 p9+261811.300.3.1 1605513919 installed <13>Aug 1 09:48:09 rpmi: python3-module-cssselect-0.9.1-alt1.2 sisyphus+227479.1100.1.2 1555757152 installed <13>Aug 1 09:48:10 rpmi: python3-module-html5lib-1:0.999999999-alt4.qa1 sisyphus+227493.600.1.3 1555835341 installed <13>Aug 1 09:48:10 rpmi: python3-module-lxml-4.4.2-alt1 p9+244111.100.1.1 1578758819 installed <13>Aug 1 09:48:10 rpmi: python3-module-soupsieve-2.0.1-alt1 p9+274160.340.3.1 1624547632 installed <13>Aug 1 09:48:10 rpmi: python3-module-beautifulsoup4-4.9.3-alt1 p9+274160.400.3.1 1624547679 installed <13>Aug 1 09:48:10 rpmi: libcom_err-1.44.6-alt1 sisyphus+224154.100.1.1 1552091678 installed <86>Aug 1 09:48:10 groupadd[2253126]: group added to /etc/group: name=_keytab, GID=499 <86>Aug 1 09:48:10 groupadd[2253126]: group added to /etc/gshadow: name=_keytab <86>Aug 1 09:48:10 groupadd[2253126]: new group: name=_keytab, GID=499 <13>Aug 1 09:48:10 rpmi: libkrb5-1.17.2-alt2 p9+280683.100.2.1 1627489307 installed <13>Aug 1 09:48:10 rpmi: libtirpc-1.0.3-alt1 1532008017 installed <13>Aug 1 09:48:10 rpmi: libnsl2-1.1.0-alt1_1 1511548749 installed <13>Aug 1 09:48:10 rpmi: python-modules-encodings-2.7.16-alt1.M90P.2 p9+260393.40.3.1 1604003681 installed <13>Aug 1 09:48:10 rpmi: python-modules-compiler-2.7.16-alt1.M90P.2 p9+260393.40.3.1 1604003681 installed <13>Aug 1 09:48:10 rpmi: python-modules-email-2.7.16-alt1.M90P.2 p9+260393.40.3.1 1604003681 installed <13>Aug 1 09:48:10 rpmi: python-modules-unittest-2.7.16-alt1.M90P.2 p9+260393.40.3.1 1604003681 installed <13>Aug 1 09:48:10 rpmi: python-modules-2.7.16-alt1.M90P.2 p9+260393.40.3.1 1604003681 installed <13>Aug 1 09:48:10 rpmi: python-modules-nis-2.7.16-alt1.M90P.2 p9+260393.40.3.1 1604003681 installed <13>Aug 1 09:48:10 rpmi: python-modules-ctypes-2.7.16-alt1.M90P.2 p9+260393.40.3.1 1604003681 installed <13>Aug 1 09:48:10 rpmi: python-modules-multiprocessing-2.7.16-alt1.M90P.2 p9+260393.40.3.1 1604003681 installed <13>Aug 1 09:48:10 rpmi: python-modules-logging-2.7.16-alt1.M90P.2 p9+260393.40.3.1 1604003681 installed <13>Aug 1 09:48:10 rpmi: python-tools-2to3-2.7.16-alt1.M90P.2 p9+260393.40.3.1 1604003681 installed Building target platforms: i586 Building for target i586 Wrote: /usr/src/in/nosrpm/python3-module-beautifulsoup4-4.9.3-alt1.nosrc.rpm (w1.gzdio) <13>Aug 1 09:48:14 rpmi: libtinfo-devel-6.1.20180407-alt2 sisyphus+222164.200.1.1 1550686226 installed <13>Aug 1 09:48:14 rpmi: libncurses-devel-6.1.20180407-alt2 sisyphus+222164.200.1.1 1550686226 installed <13>Aug 1 09:48:14 rpmi: python3-dev-3.7.4-alt3 p9+249932.100.2.1 1587127349 installed <13>Aug 1 09:48:14 rpmi: python3-module-setuptools-1:40.8.0-alt2 sisyphus+229158.200.2.1 1557735221 installed Installing python3-module-beautifulsoup4-4.9.3-alt1.src.rpm Building target platforms: i586 Building for target i586 Executing(%prep): /bin/sh -e /usr/src/tmp/rpm-tmp.46442 + umask 022 + /bin/mkdir -p /usr/src/RPM/BUILD + cd /usr/src/RPM/BUILD + cd /usr/src/RPM/BUILD + rm -rf python3-module-beautifulsoup4-4.9.3 + echo 'Source #0 (python3-module-beautifulsoup4-4.9.3.tar):' Source #0 (python3-module-beautifulsoup4-4.9.3.tar): + /bin/tar -xf /usr/src/RPM/SOURCES/python3-module-beautifulsoup4-4.9.3.tar + cd python3-module-beautifulsoup4-4.9.3 + /bin/chmod -c -Rf u+rwX,go-w . + exit 0 Executing(%build): /bin/sh -e /usr/src/tmp/rpm-tmp.46442 + umask 022 + /bin/mkdir -p /usr/src/RPM/BUILD + cd /usr/src/RPM/BUILD + cd python3-module-beautifulsoup4-4.9.3 + export LC_ALL=en_US.UTF-8 + LC_ALL=en_US.UTF-8 + 2to3 -w bs4 RefactoringTool: Skipping optional fixer: buffer RefactoringTool: Skipping optional fixer: idioms RefactoringTool: Skipping optional fixer: set_literal RefactoringTool: Skipping optional fixer: ws_comma RefactoringTool: Refactored bs4/__init__.py --- bs4/__init__.py (original) +++ bs4/__init__.py (refactored) @@ -51,7 +51,7 @@ # The very first thing we do is give a useful error if someone is # running this code under Python 3 without converting it. -'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' # Define some custom warnings. class GuessedAtParserWarning(UserWarning): @@ -100,7 +100,7 @@ # Since BeautifulSoup subclasses Tag, it's possible to treat it as # a Tag with a .name. This name makes it clear the BeautifulSoup # object isn't a real markup tag. - ROOT_TAG_NAME = u'[document]' + ROOT_TAG_NAME = '[document]' # If the end-user gives no indication which tree builder they # want, look for one with these features. @@ -217,7 +217,7 @@ from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") - if from_encoding and isinstance(markup, unicode): + if from_encoding and isinstance(markup, str): warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") from_encoding = None @@ -234,7 +234,7 @@ builder_class = builder builder = None elif builder is None: - if isinstance(features, basestring): + if isinstance(features, str): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES @@ -309,13 +309,13 @@ markup = markup.read() elif len(markup) <= 256 and ( (isinstance(markup, bytes) and not b'<' in markup) - or (isinstance(markup, unicode) and not u'<' in markup) + or (isinstance(markup, str) and not '<' in markup) ): # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, # just in case that's what the user really wants. - if (isinstance(markup, unicode) + if (isinstance(markup, str) and not os.path.supports_unicode_filenames): possible_filename = markup.encode("utf8") else: @@ -323,7 +323,7 @@ is_file = False try: is_file = os.path.exists(possible_filename) - except Exception, e: + except Exception as e: # This is almost certainly a problem involving # characters not valid in filenames on this # system. Just let it go. @@ -353,9 +353,9 @@ pass if not success: - other_exceptions = [unicode(e) for e in rejections] + other_exceptions = [str(e) for e in rejections] raise ParserRejectedMarkup( - u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) + "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) ) # Clear out the markup and remove the builder's circular @@ -406,9 +406,9 @@ if isinstance(markup, bytes): space = b' ' cant_start_with = (b"http:", b"https:") - elif isinstance(markup, unicode): - space = u' ' - cant_start_with = (u"http:", u"https:") + elif isinstance(markup, str): + RefactoringTool: Refactored bs4/dammit.py space = ' ' + cant_start_with = ("http:", "https:") else: return @@ -545,7 +545,7 @@ containerClass = self.string_container(containerClass) if self.current_data: - current_data = u''.join(self.current_data) + current_data = ''.join(self.current_data) # If whitespace is not preserved, and this string contains # nothing but ASCII spaces, replace it with a single space # or newline. @@ -748,9 +748,9 @@ eventual_encoding = None if eventual_encoding != None: encoding_part = ' encoding="%s"' % eventual_encoding - prefix = u'\n' % encoding_part + prefix = '\n' % encoding_part else: - prefix = u'' + prefix = '' if not pretty_print: indent_level = None else: @@ -788,4 +788,4 @@ if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) - print(soup.prettify()) + print((soup.prettify())) --- bs4/dammit.py (original) +++ bs4/dammit.py (refactored) @@ -10,7 +10,7 @@ __license__ = "MIT" import codecs -from htmlentitydefs import codepoint2name +from html.entities import codepoint2name import re import logging import string @@ -22,7 +22,7 @@ # PyPI package: cchardet import cchardet def chardet_dammit(s): - if isinstance(s, unicode): + if isinstance(s, str): return None return cchardet.detect(s)['encoding'] except ImportError: @@ -32,7 +32,7 @@ # PyPI package: chardet import chardet def chardet_dammit(s): - if isinstance(s, unicode): + if isinstance(s, str): return None return chardet.detect(s)['encoding'] #import chardet.constants @@ -53,14 +53,14 @@ # Build bytestring and Unicode versions of regular expressions for finding # a declared encoding inside an XML or HTML document. -xml_encoding = u'^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' -html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' +xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' +html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' encoding_res = dict() encoding_res[bytes] = { 'html' : re.compile(html_meta.encode("ascii"), re.I), 'xml' : re.compile(xml_encoding.encode("ascii"), re.I), } -encoding_res[unicode] = { +encoding_res[str] = { 'html' : re.compile(html_meta, re.I), 'xml' : re.compile(xml_encoding, re.I) } @@ -80,7 +80,7 @@ # entities, but that's a little tricky. extra = [(39, 'apos')] for codepoint, name in list(codepoint2name.items()) + extra: - character = unichr(codepoint) + character = chr(codepoint) if codepoint not in (34, 39): # There's no point in turning the quotation mark into # " or the single quote into ', unless it @@ -323,7 +323,7 @@ :return: A 2-tuple (modified data, implied encoding) """ encoding = None - if isinstance(data, unicode): + if isinstance(data, str): # Unicode data cannot have a byte-order mark. return data, encoding if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ @@ -370,7 +370,7 @@ if isinstance(markup, bytes): res = encoding_res[bytes] else: - res = encoding_res[unicode] + res = encoding_res[str] xml_re = res['xml'] html_re = res['html'] @@ -431,9 +431,9 @@ markup, override_encodings, is_html, exclude_encodings) # Short-circuit if the data is in Unicode to begin with. - if isinstance(markup, unicode) or markup == '': + if isinstance(markup, str) or markup == '': self.markup = markup - self.unicode_markup = unicode(markup) + self.unicode_markup = str(markup) RefactoringTool: Refactored bs4/diagnose.py self.original_encoding = None return @@ -523,7 +523,7 @@ :param encoding: The name of an encoding. """ - return unicode(data, encoding, errors) + return str(data, encoding, errors) @property def declared_html_encoding(self): --- bs4/diagnose.py (original) +++ bs4/diagnose.py (refactored) @@ -4,8 +4,8 @@ __license__ = "MIT" import cProfile -from StringIO import StringIO -from HTMLParser import HTMLParser +from io import StringIO +from html.parser import HTMLParser import bs4 from bs4 import BeautifulSoup, __version__ from bs4.builder import builder_registry @@ -25,8 +25,8 @@ :param data: A string containing markup that needs to be explained. :return: None; diagnostics are printed to standard output. """ - print("Diagnostic running on Beautiful Soup %s" % __version__) - print("Python version %s" % sys.version) + print(("Diagnostic running on Beautiful Soup %s" % __version__)) + print(("Python version %s" % sys.version)) basic_parsers = ["html.parser", "html5lib", "lxml"] for name in basic_parsers: @@ -35,16 +35,16 @@ break else: basic_parsers.remove(name) - print( + print(( "I noticed that %s is not installed. Installing it may help." % - name) + name)) if 'lxml' in basic_parsers: basic_parsers.append("lxml-xml") try: from lxml import etree - print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) - except ImportError, e: + print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))) + except ImportError as e: print( "lxml is not installed or couldn't be imported.") @@ -52,21 +52,21 @@ if 'html5lib' in basic_parsers: try: import html5lib - print("Found html5lib version %s" % html5lib.__version__) - except ImportError, e: + print(("Found html5lib version %s" % html5lib.__version__)) + except ImportError as e: print( "html5lib is not installed or couldn't be imported.") if hasattr(data, 'read'): data = data.read() elif data.startswith("http:") or data.startswith("https:"): - print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) + print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)) print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") return else: try: if os.path.exists(data): - print('"%s" looks like a filename. Reading data from the file.' % data) + print(('"%s" looks like a filename. Reading data from the file.' % data)) with open(data) as fp: data = fp.read() except ValueError: @@ -76,19 +76,19 @@ print("") for parser in basic_parsers: - print("Trying to parse your markup with %s" % parser) + print(("Trying to parse your markup with %s" % parser)) success = False try: soup = BeautifulSoup(data, features=parser) success = True - except Exception, e: - print("%s could not parse the markup." % parser) + except Exception as e: + print(("%s could not parse the markup." % parser)) traceback.print_exc() if success: - print("Here's what %s did with the markup:" % parser) - print(soup.prettify()) - - print("-" * 80) + print(("Here's what %s did with the markup:" % parser)) + print((soup.prettify())) + + print(("-" * 80)) def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. @@ -104,7 +104,7 @@ """ from lxml import etree for event, element in etree.iterparse(StringIO(data)RefactoringTool: Refactored bs4/element.py , html=html, **kwargs): - print("%s, %4s, %s" % (event, element.tag, element.text)) + print(("%s, %4s, %s" % (event, element.tag, element.text))) class AnnouncingParser(HTMLParser): """Subclass of HTMLParser that announces parse events, without doing @@ -193,9 +193,9 @@ def benchmark_parsers(num_elements=100000): """Very basic head-to-head performance benchmark.""" - print("Comparative parser benchmark on Beautiful Soup %s" % __version__) + print(("Comparative parser benchmark on Beautiful Soup %s" % __version__)) data = rdoc(num_elements) - print("Generated a large invalid HTML document (%d bytes)." % len(data)) + print(("Generated a large invalid HTML document (%d bytes)." % len(data))) for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: success = False @@ -204,24 +204,24 @@ soup = BeautifulSoup(data, parser) b = time.time() success = True - except Exception, e: - print("%s could not parse the markup." % parser) + except Exception as e: + print(("%s could not parse the markup." % parser)) traceback.print_exc() if success: - print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) + print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a))) from lxml import etree a = time.time() etree.HTML(data) b = time.time() - print("Raw lxml parsed the markup in %.2fs." % (b-a)) + print(("Raw lxml parsed the markup in %.2fs." % (b-a))) import html5lib parser = html5lib.HTMLParser() a = time.time() parser.parse(data) b = time.time() - print("Raw html5lib parsed the markup in %.2fs." % (b-a)) + print(("Raw html5lib parsed the markup in %.2fs." % (b-a))) def profile(num_elements=100000, parser="lxml"): """Use Python's profiler on a randomly generated document.""" --- bs4/element.py (original) +++ bs4/element.py (refactored) @@ -3,14 +3,14 @@ try: from collections.abc import Callable # Python 3.6 -except ImportError , e: +except ImportError as e: from collections import Callable import re import sys import warnings try: import soupsieve -except ImportError, e: +except ImportError as e: soupsieve = None warnings.warn( 'The soupsieve package is not installed. CSS selectors cannot be used.' @@ -57,22 +57,22 @@ # Source: # https://docs.python.org/3/library/codecs.html#python-specific-encodings PYTHON_SPECIFIC_ENCODINGS = set([ - u"idna", - u"mbcs", - u"oem", - u"palmos", - u"punycode", - u"raw_unicode_escape", - u"undefined", - u"unicode_escape", - u"raw-unicode-escape", - u"unicode-escape", - u"string-escape", - u"string_escape", + "idna", + "mbcs", + "oem", + "palmos", + "punycode", + "raw_unicode_escape", + "undefined", + "unicode_escape", + "raw-unicode-escape", + "unicode-escape", + "string-escape", + "string_escape", ]) -class NamespacedAttribute(unicode): +class NamespacedAttribute(str): """A namespaced string (e.g. 'xml:lang') that remembers the namespace ('xml') and the name ('lang') that were used to create it. """ @@ -84,18 +84,18 @@ name = None if name is None: - obj = unicode.__new__(cls, prefix) + obj = str.__new__(cls, prefix) elif prefix is None: # Not really namespaced. - obj = unicode.__new__(cls, name) + obj = str.__new__(cls, name) else: - obj = unicode.__new__(cls, prefix + ":" + name) + obj = str.__new__(cls, prefix + ":" + name) obj.prefix = prefix obj.name = name obj.namespace = namespace return obj -class AttributeValueWithCharsetSubstitution(unicode): +class AttributeValueWithCharsetSubstitution(str): """A stand-in object for a character encoding specified in HTML.""" class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): @@ -106,7 +106,7 @@ """ def __new__(cls, original_value): - obj = unicode.__new__(cls, original_value) + obj = str.__new__(cls, original_value) obj.original_value = original_value return obj @@ -134,9 +134,9 @@ match = cls.CHARSET_RE.search(original_value) if match is None: # No substitution necessary. - return unicode.__new__(unicode, original_value) - - obj = unicode.__new__(cls, original_value) + return str.__new__(str, original_value) + + obj = str.__new__(cls, original_value) obj.original_value = original_value return obj @@ -376,7 +376,7 @@ raise ValueError("Cannot insert None into a tag.") if new_child is self: raise ValueError("Cannot insert a tag into itself.") - if (isinstance(new_child, basestring) + if (isinstance(new_child, str) and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) @@ -753,7 +753,7 @@ result = (element for element in generator if isinstance(element, Tag)) return ResultSet(strainer, result) - elif isinstance(name, basestring): + elif isinstance(name, str): # Optimization to find all tags with a given name. if name.count(':') == 1: # This is a name with a prefix. If this is a namespace-aware document, @@ -872,7 +872,7 @@ return self.parents -class NavigableString(unicode, PageElement): +class NavigableString(str, PageElement): """A Python Unicode string that is part of a parse tree. When Beautiful Soup parses the markup penguin, it will @@ -895,10 +895,10 @@ passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ - if isinstance(value, unicode): - u = unicode.__new__(cls, value) + if isinstance(value, str): + u = str.__new__(cls, value) else: - u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) u.setup() return u @@ -909,7 +909,7 @@ return type(self)(self) def __getnewargs__(self): - return (unicode(self),) + return (str(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards @@ -975,30 +975,30 @@ class CData(PreformattedString): """A CDATA block.""" - PREFIX = u'' + PREFIX = '' class ProcessingInstruction(PreformattedString): """A SGML processing instruction.""" - PREFIX = u'' - SUFFIX = u'>' + PREFIX = '' + SUFFIX = '>' class XMLProcessingInstruction(ProcessingInstruction): """An XML processing instruction.""" - PREFIX = u'' - SUFFIX = u'?>' + PREFIX = '' + SUFFIX = '?>' class Comment(PreformattedString): """An HTML or XML comment.""" - PREFIX = u'' + PREFIX = '' class Declaration(PreformattedString): """An XML declaration.""" - PREFIX = u'' - SUFFIX = u'?>' + PREFIX = '' + SUFFIX = '?>' class Doctype(PreformattedString): @@ -1026,8 +1026,8 @@ return Doctype(value) - PREFIX = u'\n' + PREFIX = '\n' class Stylesheet(NavigableString): @@ -1263,7 +1263,7 @@ for string in self._all_strings(True): yield string - def get_text(self, separator=u"", strip=False, + def get_text(self, separator="", strip=False, types=(NavigableString, CData)): """Get all child strings, concatenated using the given separator. @@ -1416,7 +1416,7 @@ def __contains__(self, x): return x in self.contents - def __nonzero__(self): + def __bool__(self): "A tag is non-None even if it has no contents." return True @@ -1565,8 +1565,8 @@ else: if isinstance(val, list) or isinstance(val, tuple): val = ' '.join(val) - elif not isinstance(val, basestring): - val = unicode(val) + elif not isinstance(val, str): + val = str(val) elif ( isinstance(val, AttributeValueWithCharsetSubstitution) and eventual_encoding is not None @@ -1575,7 +1575,7 @@ text = formatter.attribute_value(val) decoded = ( - unicode(key) + '=' + str(key) + '=' + formatter.quoted_attribute_value(text)) attrs.append(decoded) close = '' @@ -1934,7 +1934,7 @@ else: attrs = kwargs normalized_attrs = {} - for key, value in attrs.items(): + for key, value in list(attrs.items()): normalized_attrs[key] = self._normalize_search_value(value) self.attrs = normalized_attrs @@ -1943,7 +1943,7 @@ def _normalize_search_value(self, value): # Leave it alone if it's a Unicode string, a callable, a # regular expression, a boolean, or None. - if (isinstance(value, unicode) or isinstance(value, Callable) or hasattr(value, 'match') + if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') or isinstance(value, bool) or value is None): return value @@ -1956,7 +1956,7 @@ new_value = [] for v in value: if (hasattr(v, '__iter__') and not isinstance(v, bytes) - and not isinstance(v, unicode)): + and not isinstance(v, str)): # This is almost certainly the user's mistake. In the # interests of avoiding infinite loops, we'll let # it through as-is rather than doing a recursive call. @@ -1968,7 +1968,7 @@ # Otherwise, convert it into a Unicode string. # The unicode(str()) thing is so this will do the same thing on Python 2 # and Python 3. - return unicode(str(value)) + return str(str(value)) def __str__(self): """A human-readable representation of this SoupStrainer.""" @@ -1996,7 +1996,7 @@ markup = markup_name markup_attrs = markup - if isinstance(self.name, basestring): + if isinstance(self.name, str): # Optimization for a very common case where the user is # searching for a tag with one specific name, and we're # looking at a tag with a different name. @@ -2052,7 +2052,7 @@ found = None # If given a list of items, scan it for a text element that # matches. - if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): for element in markup: if isinstance(element, NavigableString) \ and self.search(element): @@ -2065,7 +2065,7 @@ found = self.search_tag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): + isinstance(markup, str): if not self.name and not self.attrs and self._matches(markup, self.text): found = markup else: @@ -2110,7 +2110,7 @@ return not match_against if (hasattr(match_against, '__iter__') - and not isinstance(match_against, basestring)): + and not isinstance(match_against, str)): # We're asked to match against an iterable of items. # The markup must be match at least one item in the # iterable. We'll try each oRefactoringTool: Refactored bs4/formatter.py RefactoringTool: Refactored bs4/testing.py WARNING: couldn't encode bs4/testing.py's diff for your terminal RefactoringTool: Refactored bs4/builder/__init__.py RefactoringTool: Refactored bs4/builder/_html5lib.py ne in turn. @@ -2137,7 +2137,7 @@ # the tag's name and once against its prefixed name. match = False - if not match and isinstance(match_against, unicode): + if not match and isinstance(match_against, str): # Exact string match match = markup == match_against --- bs4/formatter.py (original) +++ bs4/formatter.py (refactored) @@ -83,7 +83,7 @@ """ if not self.entity_substitution: return ns - from element import NavigableString + from .element import NavigableString if (isinstance(ns, NavigableString) and ns.parent is not None and ns.parent.name in self.cdata_containing_tags): --- bs4/testing.py (original) +++ bs4/testing.py (refactored) @@ -25,7 +25,7 @@ from bs4.builder import HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder -BAD_DOCUMENT = u"""A bare string +BAD_DOCUMENT = """A bare string
@@ -94,7 +94,7 @@ # Verify that every tag that was opened was eventually closed. # There are no tags in the open tag counter. - assert all(v==0 for v in obj.open_tag_counter.values()) + assert all(v==0 for v in list(obj.open_tag_counter.values())) # The only tag in the tag stack is the one for the root # document. @@ -372,7 +372,7 @@ # process_markup correctly sets processing_instruction_class # even when the markup is already Unicode and there is no # need to process anything. - markup = u"""""" + markup = """""" soup = self.soup(markup) self.assertEqual(markup, soup.decode()) @@ -544,14 +544,14 @@ # "&T" and "&p" look like incomplete character entities, but they are # not. self.assertSoupEquals( - u"• AT&T is in the s&p 500
", - u"\u2022 AT&T is in the s&p 500
" + "• AT&T is in the s&p 500
", + "\u2022 AT&T is in the s&p 500
" ) def test_apos_entity(self): self.assertSoupEquals( - u"Bob's Bar
", - u"Bob's Bar
", + "Bob's Bar
", + "Bob's Bar
", ) def test_entities_in_foreign_document_encoding(self): @@ -564,17 +564,17 @@ # characters. markup = "Hello -☃
" soup = self.soup(markup) --- bs4/builder/__init__.py (original) +++ bs4/builder/__init__.py (refactored) @@ -300,13 +300,13 @@ universal = self.cdata_list_attributes.get('*', []) tag_specific = self.cdata_list_attributes.get( tag_name.lower(), None) - for attr in attrs.keys(): + for attr in list(attrs.keys()): if attr in universal or (tag_specific and attr in tag_specific): # We have a "class"-type attribute whose string # value is a whitespace-separated list of # values. Split it into a list. value = attrs[attr] - if isinstance(value, basestring): + if isinstance(value, str): values = nonwhitespace_re.findall(value) else: # html5lib sometimes calls setAttributes twice @@ -496,7 +496,7 @@ """ if isinstance(message_or_exception, Exception): e = message_or_exception - message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e)) + message_or_exception = "%s: %s" % (e.__class__.__name__, str(e)) super(ParserRejectedMarkup, self).__init__(message_or_exception) # Builders are registered in reverse order of priority, so that custom --- bs4/builder/_html5lib.py (orRefactoringTool: Refactored bs4/builder/_htmlparser.py iginal) +++ bs4/builder/_html5lib.py (refactored) @@ -33,7 +33,7 @@ # Pre-0.99999999 from html5lib.treebuilders import _base as treebuilder_base new_html5lib = False -except ImportError, e: +except ImportError as e: # 0.99999999 and up from html5lib.treebuilders import base as treebuilder_base new_html5lib = True @@ -79,7 +79,7 @@ parser = html5lib.HTMLParser(tree=self.create_treebuilder) self.underlying_builder.parser = parser extra_kwargs = dict() - if not isinstance(markup, unicode): + if not isinstance(markup, str): if new_html5lib: extra_kwargs['override_encoding'] = self.user_specified_encoding else: @@ -87,13 +87,13 @@ doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): + if isinstance(markup, str): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: original_encoding = parser.tokenizer.stream.charEncoding[0] - if not isinstance(original_encoding, basestring): + if not isinstance(original_encoding, str): # In 0.99999999 and up, the encoding is an html5lib # Encoding object. We want to use a string for compatibility # with other tree builders. @@ -110,7 +110,7 @@ def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'%s' % fragment + return '%s' % fragment class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): @@ -217,7 +217,7 @@ rv.append("|%s<%s>" % (' ' * indent, name)) if element.attrs: attributes = [] - for name, value in element.attrs.items(): + for name, value in list(element.attrs.items()): if isinstance(name, NamespacedAttribute): name = "%s %s" % (prefixes[name.namespace], name.name) if isinstance(value, list): @@ -272,7 +272,7 @@ def appendChild(self, node): string_child = child = None - if isinstance(node, basestring): + if isinstance(node, str): # Some other piece of code decided to pass in a string # instead of creating a TextElement object to contain the # string. @@ -289,7 +289,7 @@ child = node.element node.parent = self - if not isinstance(child, basestring) and child.parent is not None: + if not isinstance(child, str) and child.parent is not None: node.element.extract() if (string_child is not None and self.element.contents @@ -302,7 +302,7 @@ old_element.replace_with(new_element) self.soup._most_recent_element = new_element else: - if isinstance(node, basestring): + if isinstance(node, str): # Create a brand new NavigableString from this string. child = self.soup.new_string(node) @@ -340,7 +340,7 @@ self.soup.builder._replace_cdata_list_attribute_values( self.name, attributes) - for name, value in attributes.items(): + for name, value in list(attributes.items()): self.element[name] = value # The attributes may contain variables that need substitution. --- bs4/builder/_htmlparser.py (original) +++ bs4/builder/_htmlparser.py (refactored) @@ -8,11 +8,11 @@ 'HTMLParserTreeBuilder', ] -from HTMLParser import HTMLParser +from html.parser import HTMLParser try: - from HTMLParser import HTMLParseError -except ImportError, e: + from html.parser import HTMLParseError +except ImportError as e: # HTMLParseError isRefactoringTool: Refactored bs4/builder/_lxml.py removed in Python 3.5. Since it can never be # thrown in 3.5, we can just define our own class as a placeholder. class HTMLParseError(Exception): @@ -219,14 +219,14 @@ continue try: data = bytearray([real_name]).decode(encoding) - except UnicodeDecodeError, e: + except UnicodeDecodeError as e: pass if not data: try: - data = unichr(real_name) - except (ValueError, OverflowError), e: + data = chr(real_name) + except (ValueError, OverflowError) as e: pass - data = data or u"\N{REPLACEMENT CHARACTER}" + data = data or "\N{REPLACEMENT CHARACTER}" self.handle_data(data) def handle_entityref(self, name): @@ -353,7 +353,7 @@ document to Unicode and parsing it. Each strategy will be tried in turn. """ - if isinstance(markup, unicode): + if isinstance(markup, str): # Parse Unicode as-is. yield (markup, None, None, False) return @@ -376,7 +376,7 @@ try: parser.feed(markup) parser.close() - except HTMLParseError, e: + except HTMLParseError as e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e --- bs4/builder/_lxml.py (original) +++ bs4/builder/_lxml.py (refactored) @@ -8,11 +8,11 @@ try: from collections.abc import Callable # Python 3.6 -except ImportError , e: +except ImportError as e: from collections import Callable from io import BytesIO -from StringIO import StringIO +from io import StringIO from lxml import etree from bs4.element import ( Comment, @@ -35,7 +35,7 @@ def _invert(d): "Invert a dictionary." - return dict((v,k) for k, v in d.items()) + return dict((v,k) for k, v in list(d.items())) class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser @@ -81,7 +81,7 @@ :param mapping: A dictionary mapping namespace prefixes to URIs. """ - for key, value in mapping.items(): + for key, value in list(mapping.items()): if key and key not in self.soup._namespaces: # Let the BeautifulSoup object know about a new namespace. # If there are multiple namespaces defined with the same @@ -169,12 +169,12 @@ else: self.processing_instruction_class = XMLProcessingInstruction - if isinstance(markup, unicode): + if isinstance(markup, str): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False - if isinstance(markup, unicode): + if isinstance(markup, str): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", @@ -189,7 +189,7 @@ def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) - elif isinstance(markup, unicode): + elif isinstance(markup, str): markup = StringIO(markup) # Call feed() at least once, even if the markup is empty, @@ -204,7 +204,7 @@ if len(data) != 0: self.parser.feed(data) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(e) def close(self): @@ -233,7 +233,7 @@ # Also treat the namespace mapping as RefactoringTool: No changes to bs4/tests/__init__.py RefactoringTool: No changes to bs4/tests/test_builder_registry.py RefactoringTool: No changes to bs4/tests/test_docs.py RefactoringTool: Refactored bs4/tests/test_html5lib.py RefactoringTool: Refactored bs4/tests/test_htmlparser.py a set of attributes on the # tag, so we can recreate it later. attrs = attrs.copy() - for prefix, namespace in nsmap.items(): + for prefix, namespace in list(nsmap.items()): attribute = NamespacedAttribute( "xmlns", prefix, "http://www.w3.org/2000/xmlns/") attrs[attribute] = namespace @@ -242,7 +242,7 @@ # from lxml with namespaces attached to their names, and # turn then into NamespacedAttribute objects. new_attrs = {} - for attr, value in attrs.items(): + for attr, value in list(attrs.items()): namespace, attr = self._getNsTag(attr) if namespace is None: new_attrs[attr] = value @@ -302,7 +302,7 @@ def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'\n%s' % fragment + return '\n%s' % fragment class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): @@ -323,10 +323,10 @@ self.parser = self.parser_for(encoding) self.parser.feed(markup) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(e) def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'%s' % fragment + return '%s' % fragment --- bs4/tests/test_html5lib.py (original) +++ bs4/tests/test_html5lib.py (refactored) @@ -5,7 +5,7 @@ try: from bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True -except ImportError, e: +except ImportError as e: HTML5LIB_PRESENT = False from bs4.element import SoupStrainer from bs4.testing import ( @@ -74,14 +74,14 @@ def test_reparented_markup(self): markup = 'foo
\n' soup = self.soup(markup) - self.assertEqual(u"foo
\n", soup.body.decode()) + self.assertEqual("foo
\n", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) def test_reparented_markup_ends_with_whitespace(self): markup = 'foo
\n\n' soup = self.soup(markup) - self.assertEqual(u"foo
\n\n", soup.body.decode()) + self.assertEqual("foo
\n\n", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) def test_reparented_markup_containing_identical_whitespace_nodes(self): @@ -127,7 +127,7 @@ def test_foster_parenting(self): markup = b"""A"""
soup = self.soup(markup)
- self.assertEqual(u"A |