asda?‰PNG
IHDR ? f ??C1 sRGB ??é gAMA ±?üa pHYs ? ??o¨d GIDATx^íüL”÷e÷Y?a?("Bh?_ò???¢§?q5k?*:t0A-o??¥]VkJ¢M??f?±8\k2íll£1]q?ù???T
PK Age[wsK' ' __main__.pynu [ from html2text.cli import main
main()
PK Age[t= $ $ cli.pynu [ import argparse
import sys
from html2text import HTML2Text, __version__, config
def main():
baseurl = ""
class bcolors:
HEADER = "\033[95m"
OKBLUE = "\033[94m"
OKGREEN = "\033[92m"
WARNING = "\033[93m"
FAIL = "\033[91m"
ENDC = "\033[0m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"
p = argparse.ArgumentParser()
p.add_argument(
"--default-image-alt",
dest="default_image_alt",
default=config.DEFAULT_IMAGE_ALT,
help="The default alt string for images with missing ones",
)
p.add_argument(
"--pad-tables",
dest="pad_tables",
action="store_true",
default=config.PAD_TABLES,
help="pad the cells to equal column width in tables",
)
p.add_argument(
"--no-wrap-links",
dest="wrap_links",
action="store_false",
default=config.WRAP_LINKS,
help="don't wrap links during conversion",
)
p.add_argument(
"--wrap-list-items",
dest="wrap_list_items",
action="store_true",
default=config.WRAP_LIST_ITEMS,
help="wrap list items during conversion",
)
p.add_argument(
"--ignore-emphasis",
dest="ignore_emphasis",
action="store_true",
default=config.IGNORE_EMPHASIS,
help="don't include any formatting for emphasis",
)
p.add_argument(
"--reference-links",
dest="inline_links",
action="store_false",
default=config.INLINE_LINKS,
help="use reference style links instead of inline links",
)
p.add_argument(
"--ignore-links",
dest="ignore_links",
action="store_true",
default=config.IGNORE_ANCHORS,
help="don't include any formatting for links",
)
p.add_argument(
"--protect-links",
dest="protect_links",
action="store_true",
default=config.PROTECT_LINKS,
help="protect links from line breaks surrounding them with angle brackets",
)
p.add_argument(
"--ignore-images",
dest="ignore_images",
action="store_true",
default=config.IGNORE_IMAGES,
help="don't include any formatting for images",
)
p.add_argument(
"--images-as-html",
dest="images_as_html",
action="store_true",
default=config.IMAGES_AS_HTML,
help=(
"Always write image tags as raw html; preserves `height`, `width` and "
"`alt` if possible."
),
)
p.add_argument(
"--images-to-alt",
dest="images_to_alt",
action="store_true",
default=config.IMAGES_TO_ALT,
help="Discard image data, only keep alt text",
)
p.add_argument(
"--images-with-size",
dest="images_with_size",
action="store_true",
default=config.IMAGES_WITH_SIZE,
help=(
"Write image tags with height and width attrs as raw html to retain "
"dimensions"
),
)
p.add_argument(
"-g",
"--google-doc",
action="store_true",
dest="google_doc",
default=False,
help="convert an html-exported Google Document",
)
p.add_argument(
"-d",
"--dash-unordered-list",
action="store_true",
dest="ul_style_dash",
default=False,
help="use a dash rather than a star for unordered list items",
)
p.add_argument(
"-e",
"--asterisk-emphasis",
action="store_true",
dest="em_style_asterisk",
default=False,
help="use an asterisk rather than an underscore for emphasized text",
)
p.add_argument(
"-b",
"--body-width",
dest="body_width",
type=int,
default=config.BODY_WIDTH,
help="number of characters per output line, 0 for no wrap",
)
p.add_argument(
"-i",
"--google-list-indent",
dest="list_indent",
type=int,
default=config.GOOGLE_LIST_INDENT,
help="number of pixels Google indents nested lists",
)
p.add_argument(
"-s",
"--hide-strikethrough",
action="store_true",
dest="hide_strikethrough",
default=False,
help="hide strike-through text. only relevant when -g is " "specified as well",
)
p.add_argument(
"--escape-all",
action="store_true",
dest="escape_snob",
default=False,
help=(
"Escape all special characters. Output is less readable, but avoids "
"corner case formatting issues."
),
)
p.add_argument(
"--bypass-tables",
action="store_true",
dest="bypass_tables",
default=config.BYPASS_TABLES,
help="Format tables in HTML rather than Markdown syntax.",
)
p.add_argument(
"--ignore-tables",
action="store_true",
dest="ignore_tables",
default=config.IGNORE_TABLES,
help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
)
p.add_argument(
"--single-line-break",
action="store_true",
dest="single_line_break",
default=config.SINGLE_LINE_BREAK,
help=(
"Use a single line break after a block element rather than two line "
"breaks. NOTE: Requires --body-width=0"
),
)
p.add_argument(
"--unicode-snob",
action="store_true",
dest="unicode_snob",
default=config.UNICODE_SNOB,
help="Use unicode throughout document",
)
p.add_argument(
"--no-automatic-links",
action="store_false",
dest="use_automatic_links",
default=config.USE_AUTOMATIC_LINKS,
help="Do not use automatic links wherever applicable",
)
p.add_argument(
"--no-skip-internal-links",
action="store_false",
dest="skip_internal_links",
default=config.SKIP_INTERNAL_LINKS,
help="Do not skip internal links",
)
p.add_argument(
"--links-after-para",
action="store_true",
dest="links_each_paragraph",
default=config.LINKS_EACH_PARAGRAPH,
help="Put links after each paragraph instead of document",
)
p.add_argument(
"--mark-code",
action="store_true",
dest="mark_code",
default=config.MARK_CODE,
help="Mark program code blocks with [code]...[/code]",
)
p.add_argument(
"--decode-errors",
dest="decode_errors",
default=config.DECODE_ERRORS,
help=(
"What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
"acceptable values"
),
)
p.add_argument(
"--open-quote",
dest="open_quote",
default=config.OPEN_QUOTE,
help="The character used to open quotes",
)
p.add_argument(
"--close-quote",
dest="close_quote",
default=config.CLOSE_QUOTE,
help="The character used to close quotes",
)
p.add_argument(
"--version", action="version", version=".".join(map(str, __version__))
)
p.add_argument("filename", nargs="?")
p.add_argument("encoding", nargs="?", default="utf-8")
args = p.parse_args()
if args.filename and args.filename != "-":
with open(args.filename, "rb") as fp:
data = fp.read()
else:
data = sys.stdin.buffer.read()
try:
data = data.decode(args.encoding, args.decode_errors)
except UnicodeDecodeError as err:
warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
warning += " Use the " + bcolors.OKGREEN
warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
print(warning)
raise err
h = HTML2Text(baseurl=baseurl)
# handle options
if args.ul_style_dash:
h.ul_item_mark = "-"
if args.em_style_asterisk:
h.emphasis_mark = "*"
h.strong_mark = "__"
h.body_width = args.body_width
h.google_list_indent = args.list_indent
h.ignore_emphasis = args.ignore_emphasis
h.ignore_links = args.ignore_links
h.protect_links = args.protect_links
h.ignore_images = args.ignore_images
h.images_as_html = args.images_as_html
h.images_to_alt = args.images_to_alt
h.images_with_size = args.images_with_size
h.google_doc = args.google_doc
h.hide_strikethrough = args.hide_strikethrough
h.escape_snob = args.escape_snob
h.bypass_tables = args.bypass_tables
h.ignore_tables = args.ignore_tables
h.single_line_break = args.single_line_break
h.inline_links = args.inline_links
h.unicode_snob = args.unicode_snob
h.use_automatic_links = args.use_automatic_links
h.skip_internal_links = args.skip_internal_links
h.links_each_paragraph = args.links_each_paragraph
h.mark_code = args.mark_code
h.wrap_links = args.wrap_links
h.wrap_list_items = args.wrap_list_items
h.pad_tables = args.pad_tables
h.default_image_alt = args.default_image_alt
h.open_quote = args.open_quote
h.close_quote = args.close_quote
sys.stdout.write(h.handle(data))
PK Age[S,( ( utils.pynu [ import html.entities
from html2text import config
unifiable_n = {
html.entities.name2codepoint[k]: v
for k, v in config.UNIFIABLE.items()
if k != "nbsp"
}
def hn(tag):
if tag[0] == "h" and len(tag) == 2:
n = tag[1]
if "0" < n <= "9":
return int(n)
return 0
def dumb_property_dict(style):
"""
:returns: A hash of css attributes
"""
return {
x.strip().lower(): y.strip().lower()
for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z]
}
def dumb_css_parser(data):
"""
:type data: str
:returns: A hash of css selectors, each of which contains a hash of
css attributes.
:rtype: dict
"""
# remove @import sentences
data += ";"
importIndex = data.find("@import")
while importIndex != -1:
data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :]
importIndex = data.find("@import")
# parse the css. reverted from dictionary comprehension in order to
# support older pythons
elements = [x.split("{") for x in data.split("}") if "{" in x.strip()]
try:
elements = {a.strip(): dumb_property_dict(b) for a, b in elements}
except ValueError:
elements = {} # not that important
return elements
def element_style(attrs, style_def, parent_style):
"""
:type attrs: dict
:type style_def: dict
:type style_def: dict
:returns: A hash of the 'final' style attributes of the element
:rtype: dict
"""
style = parent_style.copy()
if "class" in attrs:
for css_class in attrs["class"].split():
css_style = style_def.get("." + css_class, {})
style.update(css_style)
if "style" in attrs:
immediate_style = dumb_property_dict(attrs["style"])
style.update(immediate_style)
return style
def google_list_style(style):
"""
Finds out whether this is an ordered or unordered list
:type style: dict
:rtype: str
"""
if "list-style-type" in style:
list_style = style["list-style-type"]
if list_style in ["disc", "circle", "square", "none"]:
return "ul"
return "ol"
def google_has_height(style):
"""
Check if the style of the element has the 'height' attribute
explicitly defined
:type style: dict
:rtype: bool
"""
return "height" in style
def google_text_emphasis(style):
"""
:type style: dict
:returns: A list of all emphasis modifiers of the element
:rtype: list
"""
emphasis = []
if "text-decoration" in style:
emphasis.append(style["text-decoration"])
if "font-style" in style:
emphasis.append(style["font-style"])
if "font-weight" in style:
emphasis.append(style["font-weight"])
return emphasis
def google_fixed_width_font(style):
"""
Check if the css of the current element defines a fixed width font
:type style: dict
:rtype: bool
"""
font_family = ""
if "font-family" in style:
font_family = style["font-family"]
return "courier new" == font_family or "consolas" == font_family
def list_numbering_start(attrs):
"""
Extract numbering from list element attributes
:type attrs: dict
:rtype: int or None
"""
if "start" in attrs:
try:
return int(attrs["start"]) - 1
except ValueError:
pass
return 0
def skipwrap(para, wrap_links, wrap_list_items):
# If it appears to contain a link
# don't wrap
if (len(config.RE_LINK.findall(para)) > 0) and not wrap_links:
return True
# If the text begins with four spaces or one tab, it's a code block;
# don't wrap
if para[0:4] == " " or para[0] == "\t":
return True
# If the text begins with only two "--", possibly preceded by
# whitespace, that's an emdash; so wrap.
stripped = para.lstrip()
if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
return False
# I'm not sure what this is for; I thought it was to detect lists,
# but there's a
-inside- case in one of the tests that
# also depends upon it.
if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
return not wrap_list_items
# If the text begins with a single -, *, or +, followed by a space,
# or an integer, followed by a ., followed by a space (in either
# case optionally proceeded by whitespace), it's a list; don't wrap.
return bool(
config.RE_ORDERED_LIST_MATCHER.match(stripped)
or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
)
def escape_md(text):
"""
Escapes markdown-sensitive characters within other markdown
constructs.
"""
return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
def escape_md_section(text, snob=False):
"""
Escapes markdown-sensitive characters across whole document sections.
"""
text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
if snob:
text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)
text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
return text
def reformat_table(lines, right_margin):
"""
Given the lines of a table
padds the cells and returns the new lines
"""
# find the maximum width of the columns
max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
max_cols = len(max_width)
for line in lines:
cols = [x.rstrip() for x in line.split("|")]
num_cols = len(cols)
# don't drop any data if colspan attributes result in unequal lengths
if num_cols < max_cols:
cols += [""] * (max_cols - num_cols)
elif max_cols < num_cols:
max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]]
max_cols = num_cols
max_width = [
max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
]
# reformat
new_lines = []
for line in lines:
cols = [x.rstrip() for x in line.split("|")]
if set(line.strip()) == set("-|"):
filler = "-"
new_cols = [
x.rstrip() + (filler * (M - len(x.rstrip())))
for x, M in zip(cols, max_width)
]
else:
filler = " "
new_cols = [
x.rstrip() + (filler * (M - len(x.rstrip())))
for x, M in zip(cols, max_width)
]
new_lines.append("|".join(new_cols))
return new_lines
def pad_tables_in_text(text, right_margin=1):
"""
Provide padding for tables in the text
"""
lines = text.split("\n")
table_buffer, table_started = [], False
new_lines = []
for line in lines:
# Toggle table started
if config.TABLE_MARKER_FOR_PAD in line:
table_started = not table_started
if not table_started:
table = reformat_table(table_buffer, right_margin)
new_lines.extend(table)
table_buffer = []
new_lines.append("")
continue
# Process lines
if table_started:
table_buffer.append(line)
else:
new_lines.append(line)
return "\n".join(new_lines)
PK Age[II $ __pycache__/cli.cpython-36.opt-1.pycnu [ 3
v:U] $ @ s0 d dl Z d dlZd dlmZmZmZ dd ZdS ) N) HTML2Text__version__configc C s d} G dd d}t j }|jddtjdd |jdd d
tjdd |jd
ddtjdd |jddd
tjdd |jddd
tjdd |jdddtj dd |jddd
tj
dd |jddd
tjdd |jd d!d
tjd"d |jd#d$d
tj
d%d |jd&d'd
tjd(d |jd)d*d
tjd+d |jd,d-d
d.d/d0d1 |jd2d3d
d4d/d5d1 |jd6d7d
d8d/d9d1 |jd:d;d |jd?d@dAttjdBd> |jdCdDd
dEd/dFd1 |jdGd
dHd/dId1 |jdJd
dKtjdLd1 |jdMd
dNtjdOd1 |jdPd
dQtjdRd1 |jdSd
dTtjdUd1 |jdVddWtjdXd1 |jdYddZtjd[d1 |jd\d
d]tjd^d1 |jd_d
d`tjdad1 |jdbdctjddd |jdedftjdgd |jdhditjdjd |jdkdldmjtt t!dn |jdodpdq |jdrdpdsdt |j" }|j#r |j#dukr t$|j#dv}|j% }W d Q R X nt&j'j(j% }y|j)|j*|j+}W n` t,k
r } zB|j-dw |j. }|dx|j/ 7 }|dy|j. dz 7 }t0| |W Y d d }~X nX t1| d{}|j2rdu|_3|j4rd||_5d}|_6|j7|_7|j8|_9|j:|_:|j;|_;|j<|_<|j=|_=|j>|_>|j?|_?|j@|_@|jA|_A|jB|_B|jC|_C|jD|_D|jE|_E|jF|_F|jG|_G|jH|_H|jI|_I|jJ|_J|jK|_K|jL|_L|jM|_M|jN|_N|jO|_O|jP|_P|jQ|_Q|jR|_Rt&jSjT|jU| d S )~N c @ s, e Zd ZdZdZdZdZdZdZdZ dZ
d S )
zmain..bcolorsz[95mz[94mz[92mz[93mz[91mz[0mz[1mz[4mN)__name__
__module____qualname__ZHEADERZOKBLUEOKGREENWARNINGZFAILENDCZBOLDZ UNDERLINE r r /usr/lib/python3.6/cli.pybcolors
s r z--default-image-altdefault_image_altz3The default alt string for images with missing ones)destdefaulthelpz--pad-tables
pad_tables
store_truez-pad the cells to equal column width in tables)r actionr r z--no-wrap-links
wrap_linksZstore_falsez"don't wrap links during conversionz--wrap-list-itemswrap_list_itemsz!wrap list items during conversionz--ignore-emphasisignore_emphasisz)don't include any formatting for emphasisz--reference-linksinline_linksz1use reference style links instead of inline linksz--ignore-linksignore_linksz&don't include any formatting for linksz--protect-links
protect_linkszCprotect links from line breaks surrounding them with angle bracketsz--ignore-images
ignore_imagesz'don't include any formatting for imagesz--images-as-htmlimages_as_htmlzWAlways write image tags as raw html; preserves `height`, `width` and `alt` if possible.z--images-to-alt
images_to_altz&Discard image data, only keep alt textz--images-with-sizeimages_with_sizezMWrite image tags with height and width attrs as raw html to retain dimensionsz-gz--google-doc
google_docFz(convert an html-exported Google Document)r r r r z-dz--dash-unordered-list
ul_style_dashz6use a dash rather than a star for unordered list itemsz-ez--asterisk-emphasisem_style_asteriskz=use an asterisk rather than an underscore for emphasized textz-bz--body-width
body_widthz3number of characters per output line, 0 for no wrap)r typer r z-iz--google-list-indentlist_indentz,number of pixels Google indents nested listsz-sz--hide-strikethroughhide_strikethroughzDhide strike-through text. only relevant when -g is specified as wellz--escape-allescape_snobzbEscape all special characters. Output is less readable, but avoids corner case formatting issues.z--bypass-tables
bypass_tablesz2Format tables in HTML rather than Markdown syntax.z--ignore-tables
ignore_tableszAIgnore table-related tags (table, th, td, tr) while keeping rows.z--single-line-breaksingle_line_breakzhUse a single line break after a block element rather than two line breaks. NOTE: Requires --body-width=0z--unicode-snobunicode_snobzUse unicode throughout documentz--no-automatic-linksuse_automatic_linksz.Do not use automatic links wherever applicablez--no-skip-internal-linksskip_internal_linkszDo not skip internal linksz--links-after-paralinks_each_paragraphz2Put links after each paragraph instead of documentz--mark-code mark_codez.Mark program code blocks with [code]...[/code]z--decode-errors
decode_errorszZWhat to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable valuesz--open-quote
open_quotez!The character used to open quotesz
--close-quoteclose_quotez"The character used to close quotesz --versionversion.)r r3 filename?)nargsencodingzutf-8)r7 r -rbzWarning:z Use the z--decode-errors=ignorez flag.)baseurl*__)VargparseArgumentParseradd_argumentr ZDEFAULT_IMAGE_ALTZ
PAD_TABLESZ
WRAP_LINKSZWRAP_LIST_ITEMSZIGNORE_EMPHASISZINLINE_LINKSZIGNORE_ANCHORSZ
PROTECT_LINKSZ
IGNORE_IMAGESZIMAGES_AS_HTMLZ
IMAGES_TO_ALTZIMAGES_WITH_SIZEintZ
BODY_WIDTHZGOOGLE_LIST_INDENTZ
BYPASS_TABLESZ
IGNORE_TABLESZSINGLE_LINE_BREAKZUNICODE_SNOBZUSE_AUTOMATIC_LINKSZSKIP_INTERNAL_LINKSZLINKS_EACH_PARAGRAPHZ MARK_CODEZ
DECODE_ERRORSZ
OPEN_QUOTEZCLOSE_QUOTEjoinmapstrr
parse_argsr5 openreadsysstdinbufferdecoder8 r0 UnicodeDecodeErrorr
r r printr r! Zul_item_markr" Z
emphasis_markZstrong_markr# r% Zgoogle_list_indentr r r r r r r r r&