This shows you the differences between two versions of the page.
| Both sides previous revision / Previous revision / Next revision | Previous revision | ||
| docu:csheet:sysadm:script:python:html_scraping [2022/01/05 14:05] – admin | docu:csheet:sysadm:script:python:html_scraping [2022/01/16 02:13] (current) – clearer examples on attributes admin | ||
|---|---|---|---|
| Line 21: | Line 21: | ||
| response = requests.get(' | response = requests.get(' | ||
| html = BeautifulSoup(response.text, | html = BeautifulSoup(response.text, | ||
| + | |||
| + | # get page title | ||
| + | print(html.title) | ||
| # select using DOM selector (list of elements) | # select using DOM selector (list of elements) | ||
| - | elements = html.select(' | + | elements = html.select(' |
| # examples on findings | # examples on findings | ||
| if len(elements) > 0: | if len(elements) > 0: | ||
| - | # get " | ||
| - | print(elements[0].get(' | ||
| # get " | # get " | ||
| print(elements[0].get(' | print(elements[0].get(' | ||
| + | print(elements[0].get(' | ||
| - | # get class | + | # or get using dictionary: |
| print(elements[0][' | print(elements[0][' | ||
| + | print(elements[0][' | ||
| # get text of DOM | # get text of DOM | ||
| print(elements[0].get_text()) | print(elements[0].get_text()) | ||
| + | print(elements[0].string) | ||
| </ | </ | ||
| + | |||
| + | Documentation on BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/ | ||