WiredSummarizer/fetchpopular.py at master · erikqu/WiredSummarizer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -*- coding: utf-8 -*-
import urllib
import urllib.parse
import urllib.request
import webbrowser

import bs4
import requests
import textrank

# Base URL that getsoup() downloads and that main() prefixes to article hrefs.
setpage = "https://www.wired.com/"
# Article hrefs collected by get_most_popular() and indexed by main().
link_list = []
def remove_related(soup):
	"""Look up the related-article ``<li>`` elements in *soup*.

	The matches are only collected: nothing is removed from *soup* and the
	function returns None.
	"""
	related_filter = {"class": "article-list-item-embed-component__post"}
	# NOTE(review): the result is never used -- this looks unfinished.
	soup.find_all("li", related_filter)

def remove_refs(input_string):
	"""Strip bracketed references such as ``[1]`` from *input_string*.

	Every span that starts at a "[" and ends at the next "]" is removed.
	If at least one span was removed, runs of whitespace in the result are
	collapsed to single spaces and leading/trailing whitespace is dropped.
	A string without any complete "[...]" span is returned unchanged.

	Args:
		input_string: Text that may contain bracketed references.

	Returns:
		The text with the bracketed spans removed.
	"""
	kept = []
	cursor = 0
	while True:
		open_at = input_string.find("[", cursor)
		if open_at == -1:
			break
		close_at = input_string.find("]", open_at + 1)
		if close_at == -1:
			# An unterminated "[" is ordinary text, not a reference.
			break
		kept.append(input_string[cursor:open_at])
		cursor = close_at + 1

	if not kept:
		# Nothing was removed: hand the input back untouched.
		return input_string

	kept.append(input_string[cursor:])
	# Spans are cut out of the original text by position, so whitespace can
	# safely be normalised once at the end without hiding later references.
	return " ".join("".join(kept).split())

def remove_trail(txt, delim):
	"""Return *txt* cut off just before the last occurrence of *delim*.

	When *delim* does not occur in *txt*, *txt* is returned whole.
	"""
	pieces = txt.rsplit(delim, maxsplit=1)
	return pieces[0]

def create_summary(page):
	"""Download the article at *page* and print its title and a summary.

	The text of every ``<p>`` element is concatenated, skipping any paragraph
	whose text contains "Use of this site constitutes". Bracketed references
	are stripped with remove_refs() and the result is handed to
	textrank.extract_sentences(). The summary is cut back to its last "."
	and a "." is appended again before printing.

	Args:
		page: URL of the article to fetch.

	Returns:
		None; the title and the summary are written to stdout.
	"""
	# Close the HTTP response as soon as the document has been parsed.
	with urllib.request.urlopen(page) as resp:
		soup = bs4.BeautifulSoup(resp, 'html.parser')

	heading = soup.find('h1')
	# A page without an <h1> is still summarised, just with an empty title.
	title = heading.text if heading is not None else ""

	text = "".join(
		paragraph.text
		for paragraph in soup.find_all('p')
		if "Use of this site constitutes" not in paragraph.text
	)
	text = remove_refs(text)

	summary = textrank.extract_sentences(text)
	# Drop whatever follows the final full stop, then restore the full stop.
	summary = remove_trail(summary, ".") + "."
	print('\n', title, '\n')
	print(summary)

def getsoup():
	"""Fetch the page at ``setpage`` and return it as a BeautifulSoup tree.

	Returns:
		The document parsed with the 'html.parser' backend.
	"""
	# The context manager closes the HTTP response once parsing is done.
	with urllib.request.urlopen(setpage) as resp:
		return bs4.BeautifulSoup(resp, 'html.parser')
def remove_section(tag):
	"""Strip the leading run of lowercase characters from *tag*.

	Only the prefix itself is removed; later occurrences of the same text
	are kept. A *tag* that does not start with a lowercase character is
	returned unchanged.

	Args:
		tag: Text to strip the lowercase prefix from.

	Returns:
		*tag* without its leading lowercase characters.
	"""
	prefix_len = 0
	for char in tag:
		if not char.islower():
			break
		prefix_len += 1
	# Slice instead of str.replace() so that only the prefix is dropped.
	return tag[prefix_len:]

# NOTE(review): this re-defines remove_trail(), which already appears earlier
# in this file with the same behaviour; this later definition is the one bound.
def remove_trail(txt, delim):
	"""Return the part of *txt* that precedes the last *delim*.

	If *delim* is absent, *txt* comes back unchanged.
	"""
	head = txt.rsplit(delim, maxsplit=1)[0]
	return head

def create_link(website, url):
	"""Return *url* appended directly to *website* (no separator is added)."""
	return website + url

def get_most_popular(link_number=5):
	"""List the most popular articles and ask the user to pick one.

	The front page is fetched with getsoup(). For up to *link_number*
	entries the href is stored in the module-level ``link_list`` and the
	cleaned-up title is printed as ``[ n ] title``.

	Args:
		link_number: Maximum number of articles to list (default 5).

	Returns:
		A ``(selection, link_names)`` tuple: the raw string the user typed
		and the list of printed titles, in display order.
	"""
	soup = getsoup()
	mostpopular = soup.find_all("li", {"class": "post-listing-list-item__post"})
	links = soup.find_all("a", {"class": "post-listing-list-item__link"})

	# Start from an empty list so its indices always match link_names,
	# even if this function is called more than once.
	link_list.clear()
	link_names = []
	# zip() stops at the shorter sequence, so a page with fewer entries than
	# requested lists what is there instead of raising IndexError.
	entries = zip(links[:link_number], mostpopular[:link_number])
	for number, (anchor, item) in enumerate(entries, start=1):
		link_list.append(anchor.get('href'))
		text = remove_trail(remove_section(item.text), 'Author')
		link_names.append(text)
		print('[', number, ']', text)
	selection = input("\nSelect article by # >")
	return selection, link_names

def _article_index(raw, count):
	"""Turn a 1-based article number typed by the user into a 0-based index.

	Returns None when *raw* is not an integer or lies outside 1..count.
	"""
	try:
		number = int(raw)
	except ValueError:
		return None
	if 1 <= number <= count:
		return number - 1
	return None


def main():
	"""Run the interactive prompt loop of the summarizer.

	Commands:
		h       print the help text
		q       quit (raises SystemExit)
		o       ask for an article number and open it in the web browser
		number  summarize that article and list the titles again

	Anything else prints an error message. After every command the user is
	prompted again, so invalid input can neither crash nor stall the loop.
	"""
	print("\nWelcome to the WIRED summarizer!\nChoose an article to be summarized (or enter h for help):")
	choice, names = get_most_popular()
	while True:
		if choice == 'q':
			print("Quitting..")
			raise SystemExit
		if choice == 'h':
			print("1-5 to summarize an article\no to open the article\nq to quit")
		elif choice == 'o':
			special = input("Enter the article #>")
			index = _article_index(special, len(link_list))
			if index is None:
				print("Not a valid command! Enter q to quit!")
			else:
				# urljoin copes with absolute, root-relative and relative hrefs.
				webbrowser.open(urllib.parse.urljoin(setpage, link_list[index]))
		else:
			index = _article_index(choice, len(link_list))
			if index is None:
				print("Not a valid command! Enter q to quit!")
			else:
				urlname = urllib.parse.urljoin(setpage, link_list[index])
				create_summary(urlname)
				print("\n")
				for number, name in enumerate(names, start=1):
					print('[', number, ']', name)
		# Always read the next command here so no branch can spin forever.
		choice = input("Select article by # >")

# Only start the interactive session when run as a script, so importing this
# module does not trigger network requests or prompts.
if __name__ == "__main__":
	main()