nlp basics beautiful soup

January 05, 2019

# NLP basics walkthrough: download text over HTTP, pull readable text out of
# HTML with BeautifulSoup, and tokenize it with NLTK and regular expressions.

import re
from urllib import request

import nltk
from bs4 import BeautifulSoup

# --- Source 1: a plain-text file ------------------------------------------
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
# Use the response as a context manager so the connection is always closed.
with request.urlopen(url) as response:
    content1 = response.read()

# --- Source 2: an HTML news article ---------------------------------------
urlA = "http://www.bbc.com/news/health-42802191"
with request.urlopen(urlA) as response:
    html_content = response.read()

soup = BeautifulSoup(html_content, 'html.parser')
# Collect the <div class="story-body__inner"> containers and keep only the
# headings, paragraphs and list items of the first one.
inner_body = soup.find_all('div', attrs={'class': 'story-body__inner'})
inner_text = [elm.text for elm in inner_body[0].find_all(['h1', 'h2', 'p', 'li'])]
text_content2 = '\n'.join(inner_text)

# Convert the downloaded bytes to str. 'unicode_escape' is the wrong codec for
# this: it decodes byte-by-byte as Latin-1 (mangling every non-ASCII character)
# and interprets literal backslashes as escape sequences. 'utf-8-sig' decodes
# UTF-8 and also drops a leading byte-order mark when one is present.
text_content1 = content1.decode('utf-8-sig')

# --- Tokenization -----------------------------------------------------------
# NOTE: the bare expressions below (e.g. `tokens1[3:8]`) only display a value
# in an interactive session or notebook; in a script they have no effect.
tokens1 = nltk.word_tokenize(text_content1)
tokens1[3:8]

tokens2 = nltk.word_tokenize(text_content2)
tokens2[:5]
len(tokens2)

# Same text tokenized with a plain regular expression: runs of word characters.
tokens2_2 = re.findall(r'\w+', text_content2)
len(tokens2_2)

# Same pattern again, this time through NLTK's regexp tokenizer.
pattern = r'\w+'
tokens2_3 = nltk.regexp_tokenize(text_content2, pattern)
len(tokens2_3)

# Wrap the token list in an nltk.Text object.
input_text2 = nltk.Text(tokens2)
type(input_text2)

# ############ Task 1 ############
# Read the HTML content from the Wikipedia page on Python and store it in
# html_content1.
with request.urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)') as response:
    html_content1 = response.read()

# Create a BeautifulSoup object from html_content1 using html.parser.
soup1 = BeautifulSoup(html_content1, 'html.parser')

# Count the links in the page: find_all('a') returns every <a> tag.
n_links = len(soup1.find_all('a'))
print(n_links)

# ############ Task 2 ############
# Find the tables whose class attribute is "wikitable"; the first match is the
# one processed below.
vartable = soup1.find_all('table', attrs={'class': 'wikitable'})

# Find all rows (<tr> tags) of that table.
rows = vartable[0].find_all('tr')
print(type(rows))
print(rows)

# Ignore the first row.
rows = rows[1:]

# For every remaining row, find its <td> cells and print the text of the first
# one. Rows without any <td> cell (e.g. rows made only of <th> cells) are
# skipped instead of raising IndexError.
for r in rows:
    cells = r.find_all('td')
    if cells:
        print(cells[0].get_text())

# How many words are obtained when the sentence "Python is cool!!!" is
# tokenized with the regular expression r'\w+'? The pattern matches runs of
# word characters only, so the punctuation is not part of any token.
exp = r'\w+'
sent = "Python is cool!!!"
print(len(nltk.regexp_tokenize(sent, exp)))
# Number of sentences found by NLTK's sentence tokenizer in the same string.
print(len(nltk.sent_tokenize(sent)))

Search This Blog

TechMusings (BigData,Hadoop,Pig,Hive,DataScience,IoT,EAI,SOA,J2EE)

nlp basics beautiful soup

Comments

Post a Comment

Popular posts from this blog

The auxService:mapreduce_shuffle does not exist

Zeppelin and Anaconda

Logistic Regression using German Credit Data