程式進修－她家

Nov 03 Tue 2020 00:51
HTML概念課

{基本構造}

<!DOCTYPE html>

她家發表在痞客邦留言(0) 人氣()

個人分類：程式進修

Oct 28 Wed 2020 19:55
PTT八卦熱門爬蟲-爬名字含有5566的ID

import requests
import time
import json
from bs4 import BeautifulSoup

# Base URL of the PTT web site; board paths and scraped relative hrefs are
# appended to it to form the URLs that get fetched.
PTT_url = 'https://www.ptt.cc'

def get_web_page(url):
    """Download *url* and return its body as text.

    The request carries an ``over18=1`` cookie (presumably to get past PTT's
    age-confirmation page -- confirm against the site).

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        when the status is anything else or the request itself fails
        (connection error, timeout, ...).
    """
    try:
        resp = requests.get(
            url=url,
            cookies={'over18': '1'},
            # Without a timeout, requests waits forever on a stalled server.
            timeout=10,
        )
    except requests.RequestException as err:
        # Keep the "None means failure" contract instead of crashing callers.
        print('Request failed:', url, err)
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text

def _parse_push_count(push_str):
    """Convert the text of an ``nrec`` cell into an integer push count."""
    if not push_str:
        return 0
    try:
        return int(push_str)
    except ValueError:
        # Non-numeric markers: '爆' is mapped to 99, anything starting with
        # 'X' to -10; every other string counts as 0.
        if push_str == '爆':
            return 99
        if push_str.startswith('X'):
            return -10
        return 0


def get_articles(dom, date):
    """Extract the articles posted on *date* from one board index page.

    Args:
        dom: HTML text of the index page.
        date: Date string compared against each entry's stripped ``date``
            cell (the caller passes ``time.strftime("%m/%d").lstrip('0')``).

    Returns:
        A ``(articles, prev_url)`` tuple. ``articles`` is a list of dicts
        with the keys ``title``, ``href``, ``push_count`` and ``author``;
        ``prev_url`` is the href of the second link in the paging button
        group (presumably the "previous page" link -- confirm against the
        page markup).
    """
    soup = BeautifulSoup(dom, 'html.parser')
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    articles = []
    for entry in soup.find_all('div', 'r-ent'):
        if entry.find('div', 'date').text.strip() != date:
            continue
        link = entry.find('a')
        if link is None:
            # Entries without a link (presumably deleted posts) are skipped.
            continue
        author_div = entry.find('div', 'author')
        articles.append({
            'title': link.text,
            'href': link['href'],
            'push_count': _parse_push_count(entry.find('div', 'nrec').text),
            'author': author_div.text if author_div is not None else '',
        })
    # Return only after every entry has been examined; returning from inside
    # the loop would stop at the first entry and yield None for empty pages.
    return articles, prev_url

def get_author_ids(posts, pattern):
    """Return the set of author names that contain *pattern*.

    Args:
        posts: Iterable of dicts, each with an ``'author'`` string.
        pattern: Substring to look for inside each author name.

    Returns:
        A set of the distinct matching author strings.
    """
    return {
        entry['author']
        for entry in posts
        if pattern in entry['author']
    }

if __name__ == '__main__':
    # Crawl the Gossiping board backwards from the newest index page,
    # collecting today's articles, then report and save them.
    current_page = get_web_page(PTT_url + '/bbs/Gossiping/index.html')
    if current_page:
        articles = []
        # Drop leading zeros so that e.g. "03/05" becomes "3/05"
        # (presumably the format used in the board's date column).
        today = time.strftime("%m/%d").lstrip('0')
        current_articles, prev_url = get_articles(current_page, today)
        # Keep walking to older pages while they still contain today's posts.
        while current_articles:
            articles += current_articles
            current_page = get_web_page(PTT_url + prev_url)
            if not current_page:
                # The fetch failed; stop paging rather than parse None.
                break
            current_articles, prev_url = get_articles(current_page, today)
        print(get_author_ids(articles, '5566'))
        print('今天有', len(articles), '篇文章')
        threshold = 50
        print('熱門文章(> %d 推):' % (threshold))
        for a in articles:
            if a['push_count'] > threshold:
                print(a)
        with open('gossiping.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)

她家發表在痞客邦留言(0) 人氣()

個人分類：程式進修

▲top

Oct 24 Sat 2020 01:11
Dcard熱門文章爬蟲

import requests
import re
from bs4 import BeautifulSoup


def main():
    """Scrape the Dcard forum front page and print the articles found.

    Each article is stored as a dict with the keys ``author``, ``title``,
    ``href`` and ``text``. A field whose element is missing from the page is
    stored as an empty string instead of aborting the whole scrape. The list
    and its length are printed to stdout.
    """
    URL = 'https://www.dcard.tw/f'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    # A timeout keeps the script from hanging forever on a stalled server.
    resp = requests.get(URL, headers=headers, timeout=10)
    soup = BeautifulSoup(resp.text, 'html.parser')

    def stripped_text(tag):
        # find() returns None when nothing matches; treat that as no text.
        return tag.text.strip() if tag is not None else ''

    articles = []
    # Find every <article> tag that has role="article".
    for div in soup.find_all('article', {"role": "article"}):
        heading = div.find("h2")
        link = heading.a if heading is not None else None
        articles.append({
            # NOTE(review): these class names look auto-generated and may
            # change whenever the site is rebuilt -- verify before relying
            # on them.
            'author': stripped_text(div.find("div", {'class': 'euk31c-2 kFrzdN'})),
            'title': stripped_text(heading),
            'href': link.get('href', '') if link is not None else '',
            'text': stripped_text(div.find("div", class_="uj732l-0 eLjDMI")),
        })
    print(articles, len(articles))


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()