asda?‰PNG  IHDR ? f ??C1 sRGB ??é gAMA ±? üa pHYs ? ??o¨d GIDATx^íüL”÷e÷Y?a?("Bh?_ò???¢§?q5k?*:t0A-o??¥]VkJ¢M??f?±8\k2íll£1]q?ù???T PKje[/m$$ element.pynu[# Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. __license__ = "MIT" try: from collections.abc import Callable # Python 3.6 except ImportError as e: from collections import Callable import re import shlex import sys import warnings from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) whitespace_re = re.compile(r"\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @property def alias(self): return getattr(self, attr) @alias.setter def alias(self): return setattr(self, attr) return alias class NamespacedAttribute(str): def __new__(cls, prefix, name, namespace=None): if name is None: obj = str.__new__(cls, prefix) elif prefix is None: # Not really namespaced. obj = str.__new__(cls, name) else: obj = str.__new__(cls, prefix + ":" + name) obj.prefix = prefix obj.name = name obj.namespace = namespace return obj class AttributeValueWithCharsetSubstitution(str): """A stand-in object for a character encoding specified in HTML.""" class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): """A generic stand-in for the value of a meta tag's 'charset' attribute. When Beautiful Soup parses the markup '', the value of the 'charset' attribute will be one of these objects. """ def __new__(cls, original_value): obj = str.__new__(cls, original_value) obj.original_value = original_value return obj def encode(self, encoding): return encoding class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): """A generic stand-in for the value of a meta tag's 'content' attribute. When Beautiful Soup parses the markup: The value of the 'content' attribute will be one of these objects. """ CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) def __new__(cls, original_value): match = cls.CHARSET_RE.search(original_value) if match is None: # No substitution necessary. return str.__new__(str, original_value) obj = str.__new__(cls, original_value) obj.original_value = original_value return obj def encode(self, encoding): def rewrite(match): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) class HTMLAwareEntitySubstitution(EntitySubstitution): """Entity substitution rules that are aware of some HTML quirks. Specifically, the contents of Hello, world! ''' soup = self.soup(html) self.assertEqual("text/javascript", soup.find('script')['type']) def test_comment(self): # Comments are represented as Comment objects. markup = "

foobaz

" self.assertSoupEquals(markup) soup = self.soup(markup) comment = soup.find(text="foobar") self.assertEqual(comment.__class__, Comment) # The comment is properly integrated into the tree. foo = soup.find(text="foo") self.assertEqual(comment, foo.next_element) baz = soup.find(text="baz") self.assertEqual(comment, baz.previous_element) def test_preserved_whitespace_in_pre_and_textarea(self): """Whitespace must be preserved in
 and "
        self.assertSoupEquals(pre_markup)
        self.assertSoupEquals(textarea_markup)

        soup = self.soup(pre_markup)
        self.assertEqual(soup.pre.prettify(), pre_markup)

        soup = self.soup(textarea_markup)
        self.assertEqual(soup.textarea.prettify(), textarea_markup)

        soup = self.soup("")
        self.assertEqual(soup.textarea.prettify(), "")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "Inside a B tag"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "

A nested tag

" self.assertSoupEquals(nested_b_tag) double_nested_b_tag = "

A doubly nested tag

" self.assertSoupEquals(nested_b_tag) def test_nested_block_level_elements(self): """Block elements can be nested.""" soup = self.soup('

Foo

') blockquote = soup.blockquote self.assertEqual(blockquote.p.b.string, 'Foo') self.assertEqual(blockquote.b.string, 'Foo') def test_correctly_nested_tables(self): """One table can go inside another one.""" markup = ('' '' "') self.assertSoupEquals( markup, '
Here's another table:" '' '' '
foo
Here\'s another table:' '
foo
' '
') self.assertSoupEquals( "" "" "
Foo
Bar
Baz
") def test_deeply_nested_multivalued_attribute(self): # html5lib can set the attributes of the same tag many times # as it rearranges the tree. This has caused problems with # multivalued attributes. markup = '
' soup = self.soup(markup) self.assertEqual(["css"], soup.div.div['class']) def test_multivalued_attribute_on_html(self): # html5lib uses a different API to set the attributes ot the # tag. This has caused problems with multivalued # attributes. markup = '' soup = self.soup(markup) self.assertEqual(["a", "b"], soup.html['class']) def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('', '') def test_strings_resembling_character_entity_references(self): # "&T" and "&p" look like incomplete character entities, but they are # not. self.assertSoupEquals( "

• AT&T is in the s&p 500

", "

\\u2022 AT&T is in the s&p 500

" ) def test_entities_in_foreign_document_encoding(self): # “ and ” are invalid numeric entities referencing # Windows-1252 characters. - references a character common # to Windows-1252 and Unicode, and ☃ references a # character only found in Unicode. # # All of these entities should be converted to Unicode # characters. markup = "

“Hello” -☃

" soup = self.soup(markup) self.assertEqual("“Hello” -☃", soup.p.string) def test_entities_in_attributes_converted_to_unicode(self): expect = '

' self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) def test_entities_in_text_converted_to_unicode(self): expect = '

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) def test_quot_entity_converted_to_quotation_mark(self): self.assertSoupEquals("

I said "good day!"

", '

I said "good day!"

') def test_out_of_range_entity(self): expect = "\N{REPLACEMENT CHARACTER}" self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) def test_multipart_strings(self): "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." soup = self.soup("

\nfoo

") self.assertEqual("p", soup.h2.string.next_element.name) self.assertEqual("p", soup.p.name) self.assertConnectedness(soup) def test_empty_element_tags(self): """Verify consistent handling of empty-element tags, no matter how they come in through the markup. """ self.assertSoupEquals('


', "


") self.assertSoupEquals('


', "


") def test_head_tag_between_head_and_body(self): "Prevent recurrence of a bug in the html5lib treebuilder." content = """ foo """ soup = self.soup(content) self.assertNotEqual(None, soup.html.body) self.assertConnectedness(soup) def test_multiple_copies_of_a_tag(self): "Prevent recurrence of a bug in the html5lib treebuilder." content = """
""" soup = self.soup(content) self.assertConnectedness(soup.article) def test_basic_namespaces(self): """Parsers don't need to *understand* namespaces, but at the very least they should not choke on namespaces or lose data.""" markup = b'4' soup = self.soup(markup) self.assertEqual(markup, soup.encode()) html = soup.html self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) self.assertEqual( 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) self.assertEqual( 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) def test_multivalued_attribute_value_becomes_list(self): markup = b'' soup = self.soup(markup) self.assertEqual(['foo', 'bar'], soup.a['class']) # # Generally speaking, tests below this point are more tests of # Beautiful Soup than tests of the tree builders. But parsers are # weird, so we run these tests separately for every tree builder # to detect any differences between them. # def test_can_parse_unicode_document(self): # A seemingly innocuous document... but it's in Unicode! And # it contains characters that can't be represented in the # encoding found in the declaration! The horror! markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' soup = self.soup(markup) self.assertEqual('Sacr\xe9 bleu!', soup.body.string) def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" strainer = SoupStrainer("b") soup = self.soup("A bold statement", parse_only=strainer) self.assertEqual(soup.decode(), "bold") def test_single_quote_attribute_values_become_double_quotes(self): self.assertSoupEquals("", '') def test_attribute_values_with_nested_quotes_are_left_alone(self): text = """a""" self.assertSoupEquals(text) def test_attribute_values_with_double_nested_quotes_get_quoted(self): text = """a""" soup = self.soup(text) soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' self.assertSoupEquals( soup.foo.decode(), """a""") def test_ampersand_in_attribute_value_gets_escaped(self): self.assertSoupEquals('', '') self.assertSoupEquals( 'foo', 'foo') def test_escaped_ampersand_in_attribute_value_is_left_alone(self): self.assertSoupEquals('') def test_entities_in_strings_converted_during_parsing(self): # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "

<<sacré bleu!>>

" expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): # Microsoft smart quotes are converted to Unicode characters during # parsing. quote = b"

\x91Foo\x92

" soup = self.soup(quote) self.assertEqual( soup.p.string, "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup("  ") self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) def test_entities_converted_on_the_way_out(self): text = "

<<sacré bleu!>>

" expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") soup = self.soup(text) self.assertEqual(soup.p.encode("utf-8"), expected) def test_real_iso_latin_document(self): # Smoke test of interrelated functionality, using an # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. unicode_html = '

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. iso_latin_html = unicode_html.encode("iso-8859-1") # Parse the ISO-Latin-1 HTML. soup = self.soup(iso_latin_html) # Encode it to UTF-8. result = soup.encode("utf-8") # What do we expect the result to look like? Well, it would # look like unicode_html, except that the META tag would say # UTF-8 instead of ISO-Latin-1. expected = unicode_html.replace("ISO-Latin-1", "utf-8") # And, of course, it would be in UTF-8, not Unicode. expected = expected.encode("utf-8") # Ta-da! self.assertEqual(result, expected) def test_real_shift_jis_document(self): # Smoke test to make sure the parser can handle a document in # Shift-JIS encoding, without choking. shift_jis_html = ( b'
'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'
') unicode_html = shift_jis_html.decode("shift-jis") soup = self.soup(unicode_html) # Make sure the parse tree is correctly encoded to various # encodings. self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) def test_real_hebrew_document(self): # A real-world test to make sure we can convert ISO-8859-9 (a # Hebrew encoding) to UTF-8. hebrew_document = b'Hebrew (ISO 8859-8) in Visual Directionality

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' soup = self.soup( hebrew_document, from_encoding="iso8859-8") # Some tree builders call it iso8859-8, others call it iso-8859-9. # That's not a difference we really care about. assert soup.original_encoding in ('iso8859-8', 'iso-8859-8') self.assertEqual( soup.encode('utf-8'), hebrew_document.decode("iso8859-8").encode("utf-8")) def test_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) content = parsed_meta['content'] self.assertEqual('text/html; charset=x-sjis', content) # But that value is actually a ContentMetaAttributeValue object. self.assertTrue(isinstance(content, ContentMetaAttributeValue)) # And it will take on a value that reflects its current # encoding. self.assertEqual('text/html; charset=utf8', content.encode("utf8")) # For the rest of the story, see TestSubstitutions in # test_tree.py. def test_html5_style_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', id="encoding") charset = parsed_meta['charset'] self.assertEqual('x-sjis', charset) # But that value is actually a CharsetMetaAttributeValue object. self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) # And it will take on a value that reflects its current # encoding. self.assertEqual('utf8', charset.encode("utf8")) def test_tag_with_no_attributes_can_have_attributes_added(self): data = self.soup("text") data.a['foo'] = 'bar' self.assertEqual('text', data.a.decode()) class XMLTreeBuilderSmokeTest(object): def test_pickle_and_unpickle_identity(self): # Pickling a tree, then unpickling it, yields a tree identical # to the original. tree = self.soup("foo") dumped = pickle.dumps(tree, 2) loaded = pickle.loads(dumped) self.assertEqual(loaded.__class__, BeautifulSoup) self.assertEqual(loaded.decode(), tree.decode()) def test_docstring_generated(self): soup = self.soup("") self.assertEqual( soup.encode(), b'\n') def test_xml_declaration(self): markup = b"""\n""" soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) def test_processing_instruction(self): markup = b"""\n""" soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) def test_real_xhtml_document(self): """A real XHTML document should come out *exactly* the same as it went in.""" markup = b""" Hello. Goodbye. """ soup = self.soup(markup) self.assertEqual( soup.encode("utf-8"), markup) def test_nested_namespaces(self): doc = b""" """ soup = self.soup(doc) self.assertEqual(doc, soup.encode()) def test_formatter_processes_script_tag_for_xml_documents(self): doc = """ """ soup = BeautifulSoup(doc, "lxml-xml") # lxml would have stripped this while parsing, but we can add # it later. soup.script.string = 'console.log("< < hey > > ");' encoded = soup.encode() self.assertTrue(b"< < hey > >" in encoded) def test_can_parse_unicode_document(self): markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' soup = self.soup(markup) self.assertEqual('Sacr\xe9 bleu!', soup.root.string) def test_popping_namespaced_tag(self): markup = 'b2012-07-02T20:33:42Zcd' soup = self.soup(markup) self.assertEqual( str(soup.rss), markup) def test_docstring_includes_correct_encoding(self): soup = self.soup("") self.assertEqual( soup.encode("latin1"), b'\n') def test_large_xml_document(self): """A large XML document should come out the same as it went in.""" markup = (b'\n' + b'0' * (2**12) + b'') soup = self.soup(markup) self.assertEqual(soup.encode("utf-8"), markup) def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): self.assertSoupEquals("

", "

") self.assertSoupEquals("

foo

") def test_namespaces_are_preserved(self): markup = 'This tag is in the a namespaceThis tag is in the b namespace' soup = self.soup(markup) root = soup.root self.assertEqual("http://example.com/", root['xmlns:a']) self.assertEqual("http://example.net/", root['xmlns:b']) def test_closing_namespaced_tag(self): markup = '

20010504

' soup = self.soup(markup) self.assertEqual(str(soup.p), markup) def test_namespaced_attributes(self): markup = '' soup = self.soup(markup) self.assertEqual(str(soup.foo), markup) def test_namespaced_attributes_xml_namespace(self): markup = 'bar' soup = self.soup(markup) self.assertEqual(str(soup.foo), markup) def test_find_by_prefixed_name(self): doc = """ foo bar baz """ soup = self.soup(doc) # There are three tags. self.assertEqual(3, len(soup.find_all('tag'))) # But two of them are ns1:tag and one of them is ns2:tag. self.assertEqual(2, len(soup.find_all('ns1:tag'))) self.assertEqual(1, len(soup.find_all('ns2:tag'))) self.assertEqual(1, len(soup.find_all('ns2:tag', key='value'))) self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag']))) def test_copy_tag_preserves_namespace(self): xml = """ """ soup = self.soup(xml) tag = soup.document duplicate = copy.copy(tag) # The two tags have the same namespace prefix. self.assertEqual(tag.prefix, duplicate.prefix) class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" def test_real_xhtml_document(self): # Since XHTML is not HTML5, HTML5 parsers are not tested to handle # XHTML documents in any particular way. pass def test_html_tags_have_namespace(self): markup = "" soup = self.soup(markup) self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) def test_svg_tags_have_namespace(self): markup = '' soup = self.soup(markup) namespace = "http://www.w3.org/2000/svg" self.assertEqual(namespace, soup.svg.namespace) self.assertEqual(namespace, soup.circle.namespace) def test_mathml_tags_have_namespace(self): markup = '5' soup = self.soup(markup) namespace = 'http://www.w3.org/1998/Math/MathML' self.assertEqual(namespace, soup.math.namespace) self.assertEqual(namespace, soup.msqrt.namespace) def test_xml_declaration_becomes_comment(self): markup = '' soup = self.soup(markup) self.assertTrue(isinstance(soup.contents[0], Comment)) self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') self.assertEqual("html", soup.contents[0].next_element.name) def skipIf(condition, reason): def nothing(test, *args, **kwargs): return None def decorator(test_item): if condition: return nothing else: return test_item return decorator PKje[U U tests/test_lxml.pynu["""Tests to ensure that the lxml tree builder generates good trees.""" import re import warnings try: import lxml.etree LXML_PRESENT = True LXML_VERSION = lxml.etree.LXML_VERSION except ImportError as e: LXML_PRESENT = False LXML_VERSION = (0,) if LXML_PRESENT: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML from bs4 import ( BeautifulSoup, BeautifulStoneSoup, ) from bs4.element import Comment, Doctype, SoupStrainer from bs4.testing import skipIf from bs4.tests import test_htmlparser from bs4.testing import ( HTMLTreeBuilderSmokeTest, XMLTreeBuilderSmokeTest, SoupTest, skipIf, ) @skipIf( not LXML_PRESENT, "lxml seems not to be present, not testing its tree builder.") class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): """See ``HTMLTreeBuilderSmokeTest``.""" @property def default_builder(self): return LXMLTreeBuilder() def test_out_of_range_entity(self): self.assertSoupEquals( "

foo�bar

", "

foobar

") self.assertSoupEquals( "

foo�bar

", "

foobar

") self.assertSoupEquals( "

foo�bar

", "

foobar

") def test_entities_in_foreign_document_encoding(self): # We can't implement this case correctly because by the time we # hear about markup like "“", it's been (incorrectly) converted into # a string like u'\x93' pass # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this # test if an old version of lxml is installed. @skipIf( not LXML_PRESENT or LXML_VERSION < (2,3,5,0), "Skipping doctype test for old version of lxml to avoid segfault.") def test_empty_doctype(self): soup = self.soup("") doctype = soup.contents[0] self.assertEqual("", doctype.strip()) def test_beautifulstonesoup_is_xml_parser(self): # Make sure that the deprecated BSS class uses an xml builder # if one is installed. with warnings.catch_warnings(record=True) as w: soup = BeautifulStoneSoup("") self.assertEqual("", str(soup.b)) self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) @skipIf( not LXML_PRESENT, "lxml seems not to be present, not testing its XML tree builder.") class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): """See ``HTMLTreeBuilderSmokeTest``.""" @property def default_builder(self): return LXMLTreeBuilderForXML() PKje[/dOdOtests/test_soup.pynu[# -*- coding: utf-8 -*- """Tests of Beautiful Soup as a whole.""" from pdb import set_trace import logging import unittest import sys import tempfile from bs4 import ( BeautifulSoup, BeautifulStoneSoup, ) from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, SoupStrainer, NamespacedAttribute, ) import bs4.dammit from bs4.dammit import ( EntitySubstitution, UnicodeDammit, EncodingDetector, ) from bs4.testing import ( SoupTest, skipIf, ) import warnings try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True except ImportError as e: LXML_PRESENT = False PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) class TestConstructor(SoupTest): def test_short_unicode_input(self): data = "

éé

" soup = self.soup(data) self.assertEqual("éé", soup.h1.string) def test_embedded_null(self): data = "

foo\0bar

" soup = self.soup(data) self.assertEqual("foo\0bar", soup.h1.string) def test_exclude_encodings(self): utf8_data = "Räksmörgås".encode("utf-8") soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) self.assertEqual("windows-1252", soup.original_encoding) class TestWarnings(SoupTest): def _no_parser_specified(self, s, is_there=True): v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80]) self.assertTrue(v) def test_warning_if_no_parser_specified(self): with warnings.catch_warnings(record=True) as w: soup = self.soup("
") msg = str(w[0].message) self._assert_no_parser_specified(msg) def test_warning_if_parser_specified_too_vague(self): with warnings.catch_warnings(record=True) as w: soup = self.soup("", "html") msg = str(w[0].message) self._assert_no_parser_specified(msg) def test_no_warning_if_explicit_parser_specified(self): with warnings.catch_warnings(record=True) as w: soup = self.soup("", "html.parser") self.assertEqual([], w) def test_parseOnlyThese_renamed_to_parse_only(self): with warnings.catch_warnings(record=True) as w: soup = self.soup("", parseOnlyThese=SoupStrainer("b")) msg = str(w[0].message) self.assertTrue("parseOnlyThese" in msg) self.assertTrue("parse_only" in msg) self.assertEqual(b"", soup.encode()) def test_fromEncoding_renamed_to_from_encoding(self): with warnings.catch_warnings(record=True) as w: utf8 = b"\xc3\xa9" soup = self.soup(utf8, fromEncoding="utf8") msg = str(w[0].message) self.assertTrue("fromEncoding" in msg) self.assertTrue("from_encoding" in msg) self.assertEqual("utf8", soup.original_encoding) def test_unrecognized_keyword_argument(self): self.assertRaises( TypeError, self.soup, "", no_such_argument=True) class TestWarnings(SoupTest): def test_disk_file_warning(self): filehandle = tempfile.NamedTemporaryFile() filename = filehandle.name try: with warnings.catch_warnings(record=True) as w: soup = self.soup(filename) msg = str(w[0].message) self.assertTrue("looks like a filename" in msg) finally: filehandle.close() # The file no longer exists, so Beautiful Soup will no longer issue the warning. with warnings.catch_warnings(record=True) as w: soup = self.soup(filename) self.assertEqual(0, len(w)) def test_url_warning_with_bytes_url(self): with warnings.catch_warnings(record=True) as warning_list: soup = self.soup(b"http://www.crummybytes.com/") # Be aware this isn't the only warning that can be raised during # execution.. self.assertTrue(any("looks like a URL" in str(w.message) for w in warning_list)) def test_url_warning_with_unicode_url(self): with warnings.catch_warnings(record=True) as warning_list: # note - this url must differ from the bytes one otherwise # python's warnings system swallows the second warning soup = self.soup("http://www.crummyunicode.com/") self.assertTrue(any("looks like a URL" in str(w.message) for w in warning_list)) def test_url_warning_with_bytes_and_space(self): with warnings.catch_warnings(record=True) as warning_list: soup = self.soup(b"http://www.crummybytes.com/ is great") self.assertFalse(any("looks like a URL" in str(w.message) for w in warning_list)) def test_url_warning_with_unicode_and_space(self): with warnings.catch_warnings(record=True) as warning_list: soup = self.soup("http://www.crummyuncode.com/ is great") self.assertFalse(any("looks like a URL" in str(w.message) for w in warning_list)) class TestSelectiveParsing(SoupTest): def test_parse_with_soupstrainer(self): markup = "NoYesNoYes Yes" strainer = SoupStrainer("b") soup = self.soup(markup, parse_only=strainer) self.assertEqual(soup.encode(), b"YesYes Yes") class TestEntitySubstitution(unittest.TestCase): """Standalone tests of the EntitySubstitution class.""" def setUp(self): self.sub = EntitySubstitution def test_simple_html_substitution(self): # Unicode characters corresponding to named HTML entites # are substituted, and no others. s = "foo\\u2200\N{SNOWMAN}\\u00f5bar" self.assertEqual(self.sub.substitute_html(s), "foo∀\N{SNOWMAN}õbar") def test_smart_quote_substitution(self): # MS smart quotes are a common source of frustration, so we # give them a special test. quotes = b"\x91\x92foo\x93\x94" dammit = UnicodeDammit(quotes) self.assertEqual(self.sub.substitute_html(dammit.markup), "‘’foo“”") def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): s = 'Welcome to "my bar"' self.assertEqual(self.sub.substitute_xml(s, False), s) def test_xml_attribute_quoting_normally_uses_double_quotes(self): self.assertEqual(self.sub.substitute_xml("Welcome", True), '"Welcome"') self.assertEqual(self.sub.substitute_xml("Bob's Bar", True), '"Bob\'s Bar"') def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): s = 'Welcome to "my bar"' self.assertEqual(self.sub.substitute_xml(s, True), "'Welcome to \"my bar\"'") def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): s = 'Welcome to "Bob\'s Bar"' self.assertEqual( self.sub.substitute_xml(s, True), '"Welcome to "Bob\'s Bar""') def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): quoted = 'Welcome to "Bob\'s Bar"' self.assertEqual(self.sub.substitute_xml(quoted), quoted) def test_xml_quoting_handles_angle_brackets(self): self.assertEqual( self.sub.substitute_xml("foo"), "foo<bar>") def test_xml_quoting_handles_ampersands(self): self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T") def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): self.assertEqual( self.sub.substitute_xml("ÁT&T"), "&Aacute;T&T") def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): self.assertEqual( self.sub.substitute_xml_containing_entities("ÁT&T"), "ÁT&T") def test_quotes_not_html_substituted(self): """There's no need to do this except inside attribute values.""" text = 'Bob\'s "bar"' self.assertEqual(self.sub.substitute_html(text), text) class TestEncodingConversion(SoupTest): # Test Beautiful Soup's ability to decode and encode from various # encodings. def setUp(self): super(TestEncodingConversion, self).setUp() self.unicode_data = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' self.utf8_data = self.unicode_data.encode("utf-8") # Just so you know what it looks like. self.assertEqual( self.utf8_data, b'Sacr\xc3\xa9 bleu!') def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. bs4.dammit.chardet_dammit = noop ascii = b"a" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, str)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None) def test_utf8_in_unicode_out(self): # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') def test_utf8_out(self): # The internal data structures can be encoded as UTF-8. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) @skipIf( PYTHON_3_PRE_3_2, "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") def test_attribute_name_containing_unicode_characters(self): markup = '
' self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) class TestUnicodeDammit(unittest.TestCase): """Standalone tests of UnicodeDammit.""" def test_unicode_input(self): markup = "I'm already Unicode! \N{SNOWMAN}" dammit = UnicodeDammit(markup) self.assertEqual(dammit.unicode_markup, markup) def test_smart_quotes_to_unicode(self): markup = b"\x91\x92\x93\x94" dammit = UnicodeDammit(markup) self.assertEqual( dammit.unicode_markup, "\\u2018\\u2019\\u201c\\u201d") def test_smart_quotes_to_xml_entities(self): markup = b"\x91\x92\x93\x94" dammit = UnicodeDammit(markup, smart_quotes_to="xml") self.assertEqual( dammit.unicode_markup, "‘’“”") def test_smart_quotes_to_html_entities(self): markup = b"\x91\x92\x93\x94" dammit = UnicodeDammit(markup, smart_quotes_to="html") self.assertEqual( dammit.unicode_markup, "‘’“”") def test_smart_quotes_to_ascii(self): markup = b"\x91\x92\x93\x94" dammit = UnicodeDammit(markup, smart_quotes_to="ascii") self.assertEqual( dammit.unicode_markup, """''""""") def test_detect_utf8(self): utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" dammit = UnicodeDammit(utf8) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}') def test_convert_hebrew(self): hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') self.assertEqual(dammit.unicode_markup, '\\u05dd\\u05d5\\u05dc\\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): utf8_data = "Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_ignore_invalid_codecs(self): utf8_data = "Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_exclude_encodings(self): # This is UTF-8. utf8_data = "Räksmörgås".encode("utf-8") # But if we exclude UTF-8 from consideration, the guess is # Windows-1252. dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') # And if we exclude that, there is no valid guess at all. dammit = UnicodeDammit( utf8_data, exclude_encodings=["utf-8", "windows-1252"]) self.assertEqual(dammit.original_encoding, None) def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): detected = EncodingDetector( b'') encodings = list(detected.encodings) assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings def test_detect_html5_style_meta_tag(self): for data in ( b'', b"", b"", b""): dammit = UnicodeDammit(data, is_html=True) self.assertEqual( "euc-jp", dammit.original_encoding) def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277 \330\250\330\252\330\261 \310\322\321\220\312\321\355\344""" chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue("\\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet def test_byte_order_mark_removed(self): # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual("áé", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding) def test_detwingle(self): # Here's a UTF8 document. utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") # Here's a Windows-1252 document. windows_1252 = ( "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") # Through some unholy alchemy, they've been stuck together. doc = utf8 + windows_1252 + utf8 # The document can't be turned into UTF-8: self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") # Unicode, Dammit thinks the whole document is Windows-1252, # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" # But if we run it through fix_embedded_windows_1252, it's fixed: fixed = UnicodeDammit.detwingle(doc) self.assertEqual( "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) def test_detwingle_ignores_multibyte_characters(self): # Each of these characters has a UTF-8 representation ending # in \x93. \x93 is a smart quote if interpreted as # Windows-1252. But our code knows to skip over multibyte # UTF-8 characters, so they'll survive the process unscathed. for tricky_unicode_char in ( "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. ): input = tricky_unicode_char.encode("utf8") self.assertTrue(input.endswith(b'\x93')) output = UnicodeDammit.detwingle(input) self.assertEqual(output, input) class TestNamedspacedAttribute(SoupTest): def test_name_may_be_none(self): a = NamespacedAttribute("xmlns", None) self.assertEqual(a, "xmlns") def test_attribute_is_equivalent_to_colon_separated_string(self): a = NamespacedAttribute("a", "b") self.assertEqual("a:b", a) def test_attributes_are_equivalent_if_prefix_and_name_identical(self): a = NamespacedAttribute("a", "b", "c") b = NamespacedAttribute("a", "b", "c") self.assertEqual(a, b) # The actual namespace is not considered. c = NamespacedAttribute("a", "b", None) self.assertEqual(a, c) # But name and prefix are important. d = NamespacedAttribute("a", "z", "c") self.assertNotEqual(a, d) e = NamespacedAttribute("z", "b", "c") self.assertNotEqual(a, e) class TestAttributeValueWithCharsetSubstitution(unittest.TestCase): def test_content_meta_attribute_value(self): value = CharsetMetaAttributeValue("euc-jp") self.assertEqual("euc-jp", value) self.assertEqual("euc-jp", value.original_value) self.assertEqual("utf8", value.encode("utf8")) def test_content_meta_attribute_value(self): value = ContentMetaAttributeValue("text/html; charset=euc-jp") self.assertEqual("text/html; charset=euc-jp", value) self.assertEqual("text/html; charset=euc-jp", value.original_value) self.assertEqual("text/html; charset=utf8", value.encode("utf8")) PKje[ރ88tests/test_tree.pynu[ # -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. The tree traversal methods are the main advantage of using Beautiful Soup over just using a parser. Different parsers will build different Beautiful Soup trees given the same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ from pdb import set_trace import copy import pickle import re import warnings from bs4 import BeautifulSoup from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) from bs4.element import ( PY3K, CData, Comment, Declaration, Doctype, NavigableString, SoupStrainer, Tag, ) from bs4.testing import ( SoupTest, skipIf, ) XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None) class TreeTest(SoupTest): def assertSelects(self, tags, should_match): """Make sure that the given tags have the correct text. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag.string for tag in tags], should_match) def assertSelectsIDs(self, tags, should_match): """Make sure that the given tags have the correct IDs. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag['id'] for tag in tags], should_match) class TestFind(TreeTest): """Basic tests of the find() method. find() just calls find_all() with limit=1, so it's not tested all that thouroughly here. """ def test_find_tag(self): soup = self.soup("1234") self.assertEqual(soup.find("b").string, "2") def test_unicode_text_find(self): soup = self.soup('

Räksmörgås

') self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås') def test_unicode_attribute_find(self): soup = self.soup('

here it is

') str(soup) self.assertEqual("here it is", soup.find(id='Räksmörgås').text) def test_find_everything(self): """Test an optimization that finds all tags.""" soup = self.soup("foobar") self.assertEqual(2, len(soup.find_all())) def test_find_everything_with_name(self): """Test an optimization that finds all tags with a given name.""" soup = self.soup("foobarbaz") self.assertEqual(2, len(soup.find_all('a'))) class TestFindAll(TreeTest): """Basic tests of the find_all() method.""" def test_find_all_text_nodes(self): """You can search the tree for text nodes.""" soup = self.soup("Foobar\xbb") # Exact match. self.assertEqual(soup.find_all(string="bar"), ["bar"]) self.assertEqual(soup.find_all(text="bar"), ["bar"]) # Match any of a number of strings. self.assertEqual( soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"]) # Match a regular expression. self.assertEqual(soup.find_all(text=re.compile('.*')), ["Foo", "bar", '\xbb']) # Match anything. self.assertEqual(soup.find_all(text=True), ["Foo", "bar", '\xbb']) def test_find_all_limit(self): """You can limit the number of items returned by find_all.""" soup = self.soup("12345") self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"]) self.assertSelects(soup.find_all('a', limit=1), ["1"]) self.assertSelects( soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"]) # A limit of 0 means no limit. self.assertSelects( soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) def test_calling_a_tag_is_calling_findall(self): soup = self.soup("123") self.assertSelects(soup('a', limit=1), ["1"]) self.assertSelects(soup.b(id="foo"), ["3"]) def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): soup = self.soup("") # Create a self-referential list. l = [] l.append(l) # Without special code in _normalize_search_value, this would cause infinite # recursion. self.assertEqual([], soup.find_all(l)) def test_find_all_resultset(self): """All find_all calls return a ResultSet""" soup = self.soup("") result = soup.find_all("a") self.assertTrue(hasattr(result, "source")) result = soup.find_all(True) self.assertTrue(hasattr(result, "source")) result = soup.find_all(text="foo") self.assertTrue(hasattr(result, "source")) class TestFindAllBasicNamespaces(TreeTest): def test_find_by_namespaced_name(self): soup = self.soup('4') self.assertEqual("4", soup.find("mathml:msqrt").string) self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) class TestFindAllByName(TreeTest): """Test ways of finding tags by tag name.""" def setUp(self): super(TreeTest, self).setUp() self.tree = self.soup("""First tag. Second tag. Third Nested tag. tag.""") def test_find_all_by_tag_name(self): # Find all the tags. self.assertSelects( self.tree.find_all('a'), ['First tag.', 'Nested tag.']) def test_find_all_by_name_and_text(self): self.assertSelects( self.tree.find_all('a', text='First tag.'), ['First tag.']) self.assertSelects( self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) self.assertSelects( self.tree.find_all('a', text=re.compile("tag")), ['First tag.', 'Nested tag.']) def test_find_all_on_non_root_element(self): # You can call find_all on any node, not just the root. self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) def test_calling_element_invokes_find_all(self): self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) def test_find_all_by_tag_strainer(self): self.assertSelects( self.tree.find_all(SoupStrainer('a')), ['First tag.', 'Nested tag.']) def test_find_all_by_tag_names(self): self.assertSelects( self.tree.find_all(['a', 'b']), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_by_tag_dict(self): self.assertSelects( self.tree.find_all({'a' : True, 'b' : True}), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_by_tag_re(self): self.assertSelects( self.tree.find_all(re.compile('^[ab]$')), ['First tag.', 'Second tag.', 'Nested tag.']) def test_find_all_with_tags_matching_method(self): # You can define an oracle method that determines whether # a tag matches the search. def id_matches_name(tag): return tag.name == tag.get('id') tree = self.soup("""Match 1. Does not match. Match 2.""") self.assertSelects( tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) def test_find_with_multi_valued_attribute(self): soup = self.soup( "
1
2
3
" ) r1 = soup.find('div', 'a d'); r2 = soup.find('div', re.compile(r'a d')); r3, r4 = soup.find_all('div', ['a b', 'a d']); self.assertEqual('3', r1.string) self.assertEqual('3', r2.string) self.assertEqual('1', r3.string) self.assertEqual('3', r4.string) class TestFindAllByAttribute(TreeTest): def test_find_all_by_attribute_name(self): # You can pass in keyword arguments to find_all to search by # attribute. tree = self.soup(""" Matching a. Non-matching Matching b.a. """) self.assertSelects(tree.find_all(id='first'), ["Matching a.", "Matching b."]) def test_find_all_by_utf8_attribute_value(self): peace = "םולש".encode("utf8") data = ''.encode("utf8") soup = self.soup(data) self.assertEqual([soup.a], soup.find_all(title=peace)) self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) def test_find_all_by_attribute_dict(self): # You can pass in a dictionary as the argument 'attrs'. This # lets you search for attributes like 'name' (a fixed argument # to find_all) and 'class' (a reserved word in Python.) tree = self.soup(""" Name match. Class match. Non-match. A tag called 'name1'. """) # This doesn't do what you want. self.assertSelects(tree.find_all(name='name1'), ["A tag called 'name1'."]) # This does what you want. self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), ["Name match."]) self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), ["Class match."]) def test_find_all_by_class(self): tree = self.soup(""" Class 1. Class 2. Class 1. Class 3 and 4. """) # Passing in the class_ keyword argument will search against # the 'class' attribute. self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.']) self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.']) self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.']) # Passing in a string to 'attrs' will also search the CSS class. self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) def test_find_by_class_when_multiple_classes_present(self): tree = self.soup("Found it") f = tree.find_all("gar", class_=re.compile("o")) self.assertSelects(f, ["Found it"]) f = tree.find_all("gar", class_=re.compile("a")) self.assertSelects(f, ["Found it"]) # If the search fails to match the individual strings "foo" and "bar", # it will be tried against the combined string "foo bar". f = tree.find_all("gar", class_=re.compile("o b")) self.assertSelects(f, ["Found it"]) def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): soup = self.soup("Found it") self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) def big_attribute_value(value): return len(value) > 3 self.assertSelects(soup.find_all("a", big_attribute_value), []) def small_attribute_value(value): return len(value) <= 3 self.assertSelects( soup.find_all("a", small_attribute_value), ["Found it"]) def test_find_all_with_string_for_attrs_finds_multiple_classes(self): soup = self.soup('') a, a2 = soup.find_all("a") self.assertEqual([a, a2], soup.find_all("a", "foo")) self.assertEqual([a], soup.find_all("a", "bar")) # If you specify the class as a string that contains a # space, only that specific value will be found. self.assertEqual([a], soup.find_all("a", class_="foo bar")) self.assertEqual([a], soup.find_all("a", "foo bar")) self.assertEqual([], soup.find_all("a", "bar foo")) def test_find_all_by_attribute_soupstrainer(self): tree = self.soup(""" Match. Non-match.""") strainer = SoupStrainer(attrs={'id' : 'first'}) self.assertSelects(tree.find_all(strainer), ['Match.']) def test_find_all_with_missing_attribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that do not have that attribute set. tree = self.soup("""ID present. No ID present. ID is empty.""") self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) def test_find_all_with_defined_attribute(self): # You can pass in None as the value of an attribute to find_all. # This will match tags that have that attribute set to any value. tree = self.soup("""ID present. No ID present. ID is empty.""") self.assertSelects( tree.find_all(id=True), ["ID present.", "ID is empty."]) def test_find_all_with_numeric_attribute(self): # If you search for a number, it's treated as a string. tree = self.soup("""Unquoted attribute. Quoted attribute.""") expected = ["Unquoted attribute.", "Quoted attribute."] self.assertSelects(tree.find_all(id=1), expected) self.assertSelects(tree.find_all(id="1"), expected) def test_find_all_with_list_attribute_values(self): # You can pass a list of attribute values instead of just one, # and you'll get tags that match any of the values. tree = self.soup("""1 2 3 No ID.""") self.assertSelects(tree.find_all(id=["1", "3", "4"]), ["1", "3"]) def test_find_all_with_regular_expression_attribute_value(self): # You can pass a regular expression as an attribute value, and # you'll get tags whose values for that attribute match the # regular expression. tree = self.soup("""One a. Two as. Mixed as and bs. One b. No ID.""") self.assertSelects(tree.find_all(id=re.compile("^a+$")), ["One a.", "Two as."]) def test_find_by_name_and_containing_string(self): soup = self.soup("foobarfoo") a = soup.a self.assertEqual([a], soup.find_all("a", text="foo")) self.assertEqual([], soup.find_all("a", text="bar")) self.assertEqual([], soup.find_all("a", text="bar")) def test_find_by_name_and_containing_string_when_string_is_buried(self): soup = self.soup("foofoo") self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) def test_find_by_attribute_and_containing_string(self): soup = self.soup('foofoo') a = soup.a self.assertEqual([a], soup.find_all(id=2, text="foo")) self.assertEqual([], soup.find_all(id=1, text="bar")) class TestIndex(TreeTest): """Test Tag.index""" def test_index(self): tree = self.soup("""
Identical Not identical Identical Identical with child Also not identical Identical with child
""") div = tree.div for i, element in enumerate(div.contents): self.assertEqual(i, div.index(element)) self.assertRaises(ValueError, tree.index, 1) class TestParentOperations(TreeTest): """Test navigation and searching through an element's parents.""" def setUp(self): super(TestParentOperations, self).setUp() self.tree = self.soup('''