ÿØÿà JFIF ` ` ÿþxØ
| Server IP : 109.234.164.53 / Your IP : 216.73.216.110 Web Server : Apache System : Linux cervelle.o2switch.net 4.18.0-553.32.1.lve.el8.x86_64 #1 SMP Thu Dec 19 13:14:03 UTC 2024 x86_64 User : computer3 ( 1098) PHP Version : 7.1.33 Disable Function : NONE MySQL : OFF | cURL : ON | WGET : ON | Perl : ON | Python : ON | Sudo : OFF | Pkexec : OFF Directory : /opt/alt/python37/share/doc/alt-python37-pyparsing-doc/examples/ |
Upload File : |
# URL extractor
# Copyright 2004, Paul McGuire
from pyparsing import makeHTMLTags, pyparsing_common as ppc
from urllib.request import urlopen
import pprint
# Build a grammar that matches an <a ...>...</a> element.
linkOpenTag, linkCloseTag = makeHTMLTags("a")

# tag_body is the expression attached to the opening tag for the text between
# the opening and closing tags (see the pyparsing makeHTMLTags documentation).
linkBody = linkOpenTag.tag_body
# Remove any nested markup from the link text ...
linkBody.setParseAction(ppc.stripHTMLTags)
# ... then collapse runs of whitespace (including newlines) to single spaces.
linkBody.addParseAction(lambda toks: " ".join(toks[0].strip().split()))

link = linkOpenTag + linkBody("body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.
with urlopen("https://www.cnn.com/") as serverListPage:
    # Prefer the charset declared in the response headers, falling back to
    # UTF-8; undecodable bytes are replaced so one bad byte does not abort
    # the whole run.
    charset = serverListPage.headers.get_content_charset() or "UTF-8"
    htmlText = serverListPage.read().decode(charset, errors="replace")

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).  The matched tokens are kept so
# the page only has to be scanned once.
matches = []
for toks, strt, end in link.scanString(htmlText):
    print(toks.asList())
    matches.append(toks)

# Create a dictionary with a dict comprehension, mapping each link's text to its
# href.  Links that share the same text overwrite earlier entries.
pprint.pprint({toks.body: toks.href for toks in matches})