# (scrape artifact removed from top of file: "close")
import requests
import time
import json
from bs4 import BeautifulSoup

PTT_url = 'https://www.ptt.cc'


def get_web_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 response.

    Sends the 'over18' cookie so PTT's age-confirmation gate does not
    intercept the request.
    """
    resp = requests.get(url=url, cookies={'over18': '1'})
    if resp.status_code != 200:
        # Bug fix: message previously read 'Ivalid url:'.
        print('Invalid url:', resp.url)
        return None
    return resp.text


def get_articles(dom, date):
    """Parse one board index page and collect articles posted on *date*.

    Parameters:
        dom:  HTML text of a PTT board index page.
        date: date string as shown on the page, e.g. '8/31' (no leading zero).

    Returns:
        (articles, prev_url) where *articles* is a list of dicts with keys
        'title', 'href', 'push_count', 'author', and *prev_url* is the href
        of the previous (older) index page.
    """
    soup = BeautifulSoup(dom, 'html.parser')

    # The second anchor in the paging button group is the '上頁' (older) link.
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    articles = []
    for d in soup.find_all('div', 'r-ent'):
        if d.find('div', 'date').text.strip() != date:
            continue

        # Push count: an integer, '爆' (capped display for 100+ pushes),
        # or 'X...' (net downvotes) — the latter two fail int() parsing.
        push_count = 0
        push_str = d.find('div', 'nrec').text
        if push_str:
            try:
                push_count = int(push_str)
            except ValueError:
                if push_str == '爆':
                    push_count = 99
                elif push_str.startswith('X'):
                    push_count = -10

        # Deleted articles have no <a> link; skip them.
        link = d.find('a')
        if link:
            author_div = d.find('div', 'author')
            articles.append({
                'title': link.text,
                'href': link['href'],
                'push_count': push_count,
                'author': author_div.text if author_div else '',
            })
    return articles, prev_url


def get_author_ids(posts, pattern):
    """Return the set of author ids among *posts* that contain *pattern*."""
    return {post['author'] for post in posts if pattern in post['author']}


if __name__ == '__main__':
    current_page = get_web_page(PTT_url + '/bbs/Gossiping/index.html')
    if current_page:
        articles = []
        today = time.strftime("%m/%d").lstrip('0')
        current_articles, prev_url = get_articles(current_page, today)
        # Walk backwards through index pages until a page yields no article
        # dated today. Bug fix: if a fetch fails, get_web_page returns None;
        # stop cleanly instead of crashing inside BeautifulSoup.
        while current_articles:
            articles += current_articles
            current_page = get_web_page(PTT_url + prev_url)
            if current_page is None:
                break
            current_articles, prev_url = get_articles(current_page, today)

        print(get_author_ids(articles, '5566'))
        print('今天有', len(articles), '篇文章')
        threshold = 50
        print('熱門文章(> %d 推):' % (threshold))
        for a in articles:
            if int(a['push_count']) > threshold:
                print(a)

        with open('gossiping.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)
# (scrape artifact removed from end of file: "全站熱搜" — blog-platform footer text)