#
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Download the sample page and parse it. The context manager closes the
# HTTP response as soon as its body has been read.
with urlopen('http://pythonscraping.com/pages/page1.html') as res:
    bs = BeautifulSoup(res.read(), 'html.parser')

# Print the first <h1> tag of the parsed document.
print(bs.h1)
import sys
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

from bs4 import BeautifulSoup


def getTitle(url):
    """Fetch *url* and return the ``<h1>`` found under its ``<body>``.

    Args:
        url: Address of the page to download.

    Returns:
        The value of ``body.h1`` from the parsed page, or ``None`` when the
        page cannot be fetched or the parsed document has no ``<body>``.
    """
    try:
        response = urlopen(url)
    except (HTTPError, URLError) as err:
        # HTTPError: the server answered with an error status.
        # URLError: the request never got a usable answer (e.g. the host
        # could not be reached). Both are reported the same way.
        print(err)
        return None

    # Parse inside the context manager so the connection is always closed.
    with response:
        bsObj = BeautifulSoup(response, "html.parser")

    try:
        # Attribute access on a missing <body> raises AttributeError.
        return bsObj.body.h1
    except AttributeError:
        return None


title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)
import datetime
import random
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Seed the random number generator from the current time. random.seed()
# only accepts None, int, float, str, bytes or bytearray (other types raise
# TypeError since Python 3.11), so pass the POSIX timestamp rather than the
# datetime object itself.
random.seed(datetime.datetime.now().timestamp())

# Matches hrefs that start with /wiki/ and contain no colon afterwards.
# Compiled once here instead of on every getLinks() call.
_ARTICLE_HREF = re.compile(r"^(/wiki/)((?!:).)*$")


def getLinks(articleUrl):
    """Return the article links found in a Wikipedia page's body content.

    Args:
        articleUrl: Site-relative path of the page, e.g. ``"/wiki/Kevin_Bacon"``.
            It is appended to ``http://en.wikipedia.org``.

    Returns:
        The ``<a>`` elements inside ``<div id="bodyContent">`` whose ``href``
        matches ``_ARTICLE_HREF``; an empty list when the page has no such
        div.
    """
    # The context manager closes the HTTP response once parsing is done.
    with urlopen("http://en.wikipedia.org" + articleUrl) as html:
        bsObj = BeautifulSoup(html, "html.parser")

    content = bsObj.find("div", {"id": "bodyContent"})
    if content is None:
        # No body-content container: report "no links" instead of crashing
        # with AttributeError on None.
        return []
    return content.find_all("a", href=_ARTICLE_HREF)


# Random walk: keep following a randomly chosen article link until a page
# yields no links at all.
links = getLinks("/wiki/Kevin_Bacon")
while links:
    newArticle = random.choice(links).attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)
NLTK(Natural Language Toolkit,自然语言处理工具包)是 NLP 领域中最常用的 Python 库之一。