# NLP basics: Beautiful Soup
import nltk
import re
from urllib import request
# Download the raw bytes of a Project Gutenberg plain-text e-book.
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
# The response is used as a context manager so the underlying HTTP
# connection is closed as soon as the body has been read.
with request.urlopen(url) as response:
    content1 = response.read()

# Download the raw HTML bytes of a BBC News article.
urlA = "http://www.bbc.com/news/health-42802191"
with request.urlopen(urlA) as response:
    html_content = response.read()
from bs4 import BeautifulSoup
# Parse the downloaded BBC page using the 'html.parser' backend.
soup = BeautifulSoup(html_content, 'html.parser')
# Look for the article container: <div class="story-body__inner">.
inner_body = soup.find_all('div', attrs={'class': 'story-body__inner'})
# If the page has no such container, fall back to scanning the whole
# document instead of failing with IndexError on inner_body[0].
article_root = inner_body[0] if inner_body else soup
# Collect the text of every heading, paragraph and list item under the root.
inner_text = [elm.text for elm in article_root.find_all(['h1', 'h2', 'p', 'li'])]
# Join the pieces into a single string, one element per line.
text_content2 = '\n'.join(inner_text)
# Convert the downloaded bytes to str.
# The e-book file is UTF-8 text, so it must be decoded as UTF-8. The
# 'unicode_escape' codec decodes bytes as Latin-1 and interprets backslash
# sequences, which corrupts every non-ASCII character (curly quotes,
# accented letters). 'utf-8-sig' additionally drops a leading byte-order
# mark if the file starts with one.
text_content1 = content1.decode('utf-8-sig')
# Split the full text into word and punctuation tokens.
tokens1 = nltk.word_tokenize(text_content1)
# Notebook-style peek at a few tokens; has no effect when run as a script.
tokens1[3:8]
# Compare three ways of tokenizing the article text. The bare expressions
# below only display a value in an interactive session/notebook.

# Shared regular expression: maximal runs of word characters.
pattern = r'\w+'

# 1) NLTK's default word tokenizer.
tokens2 = nltk.word_tokenize(text_content2)
tokens2[:5]
len(tokens2)

# 2) The standard-library `re` module with the shared pattern.
tokens2_2 = re.findall(pattern, text_content2)
len(tokens2_2)

# 3) NLTK's regular-expression tokenizer with the same pattern.
tokens2_3 = nltk.regexp_tokenize(text_content2, pattern)
len(tokens2_3)

# Wrap the word_tokenize output in an nltk.Text object.
input_text2 = nltk.Text(tokens2)
type(input_text2)
# Task 1: read the HTML of the Wikipedia article on Python (the programming
# language) into html_content1 as raw bytes.
# The response is used as a context manager so the HTTP connection is
# closed once the body has been read.
with request.urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)') as response:
    html_content1 = response.read()
# Build a BeautifulSoup tree from the page using the 'html.parser' backend.
soup1 = BeautifulSoup(html_content1, 'html.parser')
# Count the links in the page, i.e. every <a> tag found in the tree.
n_links = len(soup1.find_all('a'))
# Report the number of links.
print(n_links)
############# Task 2 #########################
# Find the tables in the page whose class attribute is "wikitable".
vartable = soup1.find_all('table', attrs={'class': 'wikitable'})
# Take all rows (<tr> tags) of the first such table.
rows = vartable[0].find_all('tr')
print(type(rows))
print(rows)
# Ignore the first row of the table.
rows = rows[1:]
# For every remaining row, print the text of its first <td> column.
for r in rows:
    cells = r.find_all('td')
    # A row made only of <th> header cells has no <td>; skip it rather
    # than raising IndexError on cells[0].
    if cells:
        print(cells[0].get_text())
# Question: how many words are obtained when the sentence
# "Python is cool!!!" is tokenized with the regular expression r'\w+'?
exp = r'\w+'
sent = "Python is cool!!!"
# Word-level split: keep only runs of word characters.
word_tokens = nltk.regexp_tokenize(sent, exp)
print(len(word_tokens))
# Sentence-level split of the same string, for comparison.
sentences = nltk.sent_tokenize(sent)
print(len(sentences))
# Comments
# Post a Comment