-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdziadzkielczowa.py
More file actions
87 lines (61 loc) · 3.05 KB
/
Copy pathdziadzkielczowa.py
File metadata and controls
87 lines (61 loc) · 3.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#%%import
from __future__ import unicode_literals
import requests
from bs4 import BeautifulSoup
import pandas as pd
import regex as re
import time
from time import mktime
from tqdm import tqdm #licznik
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import json
#%% def
def get_articles_links(sitemap, timeout=30):
    """Return every URL listed in the ``<loc>`` elements of a sitemap.

    Args:
        sitemap: URL of the sitemap document to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A list with the text of each ``<loc>`` element, in document order.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(sitemap, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an
    # empty list of links.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return [loc.text for loc in soup.find_all('loc')]
def dictionary_of_article(article_link):
    """Scrape one blog post and append its record to ``all_results``.

    The post page is downloaded and parsed for its title, publication date,
    body text, outgoing links and image sources. The resulting dictionary is
    appended to the module-level ``all_results`` list; nothing is returned.

    Args:
        article_link: URL of the post to scrape.

    Raises:
        requests.RequestException: If a download fails or times out.
        AttributeError, TypeError, IndexError: If the page lacks the expected
            title, ``<time class="published">`` or post-body elements.
    """
    html_text = requests.get(article_link, timeout=30).text
    # Keep retrying for as long as the response body contains "Error 503".
    # NOTE(review): the retry count is unbounded, as in the original code.
    while 'Error 503' in html_text:
        time.sleep(2)
        html_text = requests.get(article_link, timeout=30).text
    soup = BeautifulSoup(html_text, 'html.parser')

    author = "Radosław Wiśniewski"
    title_of_article = soup.find('h3', class_='post-title entry-title').text.strip()
    date_of_publication = soup.find('time', class_='published')['datetime']
    # Keep only the date part that precedes the "T" separator.
    date_of_publication = re.findall(r'^([\d-]*)(?=T)', date_of_publication)[0]
    article = soup.find('div', class_='post-body entry-content float-container')
    text_of_article = article.text

    # Only anchors that actually carry an href are considered; a single
    # href-less <a> (e.g. a named anchor) must not discard every link.
    hrefs = [anchor['href'] for anchor in article.find_all('a', href=True)]
    external_links = ' | '.join(
        href for href in hrefs
        if not re.search(r'blogger|blogspot|dziadzkielczowa', href)
    )

    # Query the images once; an <img> without src is skipped instead of
    # raising KeyError while the record is being built.
    images = article.find_all('img')
    photos_links = ' | '.join(img['src'] for img in images if img.has_attr('src'))

    article_record = {'Link': article_link,
                      'Data publikacji': date_of_publication,
                      'Autor': author,
                      'Tytuł artykułu': title_of_article,
                      'Tekst artykułu': text_of_article,
                      'Linki zewnętrzne': external_links if external_links else False,
                      'Zdjęcia/Grafika': bool(images),
                      'Linki do zdjęć': photos_links
                      }
    all_results.append(article_record)
#%% main
# Local import: pathlib is only needed by this script section.
from pathlib import Path

articles_links = get_articles_links('https://dziad-z-kielczowa.blogspot.com/sitemap.xml')
all_results = []

# Scrape the posts concurrently; each worker appends its record to all_results.
with ThreadPoolExecutor() as executor:
    # list(...) drains the lazy map so tqdm advances and any exception
    # raised in a worker is re-raised here.
    list(tqdm(executor.map(dictionary_of_article, articles_links), total=len(articles_links)))

# Build OS-independent output paths and make sure the target folder exists.
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)
today = datetime.today().date()  # computed once so both files share one stamp
json_path = output_dir / f'dziadzkielczowa_{today}.json'
xlsx_path = output_dir / f'dziadzkielczowa_{today}.xlsx'

with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, ensure_ascii=False)

df = pd.DataFrame(all_results).drop_duplicates()
df["Data publikacji"] = pd.to_datetime(df["Data publikacji"]).dt.date
df = df.sort_values('Data publikacji', ascending=False)

# xlsxwriter options go through engine_kwargs; the old `options=` keyword,
# `to_excel(encoding=...)` and `writer.save()` no longer exist in pandas 2.x.
# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter(xlsx_path, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    df.to_excel(writer, sheet_name='Posts', index=False)