1. 네이버 카페 지정 게시판 내용 크롤링

  • 크롤링 전에 대상 카페의 ID(clubid)와 게시판 번호(menuid)를 먼저 확인해야 합니다.
  • 지정한 게시판의 게시글 목록(List)을 시작 페이지부터 마지막 페이지까지 모두 크롤링합니다.
import time
from datetime import datetime as dt
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# HTTP headers sent with every list-page request (desktop Chrome signature).
my_header = dict([
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9",
    ),
    ("upgrade-insecure-requests", "1"),
    (
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
    ),
])

# Result table: one row per crawled article, filled in by the crawl loop.
df = pd.DataFrame(
    columns=[
        "No",
        "날짜",
        "제목",
        "댓글수",
        "조회수",
        "좋아요",
        "사진",
        "글쓴이",
        "본문링크",
    ]
)
index_no = 0  # next row label to write into df

# Crawl range and target board.
pageNo = 614  # list page the crawl starts from
end_page = -1  # -1 means: keep going until the last page
cafeid = 10050813  # cafe ID of the "Powder Room" cafe
menuid = 42  # board number; see the dictionary below

# Board number (as a string) -> board display name.
match_box = {
    "51": "피부케어팁",
    "92": "스킨케어Q&A",
    "31": "스킨케어리뷰",
    "39": "클렌징리뷰",
    "40": "트러블케어리뷰",
    "42": "선케어리뷰",
}

def _cell_text(row, selector, default=0):
    """Return the text of the first element in *row* matching *selector*.

    Falls back to *default* when the row has no such element, so optional
    cells (comment count, photo marker, likes, views) are recorded as 0.
    """
    node = row.select_one(selector)
    return default if node is None else node.text


# Crawl the board's list pages one by one, appending one df row per article.
# The loop ends when the optional end_page bound is passed, when a request
# fails, or when a page no longer yields article rows.
while True:
    # Check the upper bound BEFORE requesting, so no request is spent on a
    # page that would be discarded (end_page == -1 means "no upper bound").
    if end_page != -1 and pageNo > end_page:
        print("-"*100)
        print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} Crawling is finished at PageNo {pageNo-1}, Thank You!!")
        break

    # Both clubid and cafeId are taken from `cafeid`, so changing the
    # configured cafe cannot leave a stale ID in the query string.
    url = (
        f"https://cafe.naver.com/ArticleList.nhn?search.clubid={cafeid}&search.menuid={menuid}"
        f"&search.boardtype=L&search.totalCount=151&search.cafeId={cafeid}&search.page={pageNo}"
    )
    try:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        res = requests.get(url, headers=my_header, timeout=10)
        res.raise_for_status()
    except requests.RequestException as err:
        # Stop instead of crashing so the rows collected so far remain in df.
        print("-" * 100)
        print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} Url Request Failed, PageNo is {pageNo}: {err}")
        break

    html = BeautifulSoup(res.text, "html.parser")
    # Article rows are the <tr> elements without id/class that contain an
    # article cell.
    boxitems = [
        tr
        for tr in html.find_all("tr", {"id": None, "class": None})
        if tr.select("td.td_article")
    ]

    # NOTE(review): the original threshold is kept, so a page with exactly one
    # article row is also treated as the end of the board — confirm whether
    # out-of-range pages really return a single leftover row.
    if len(boxitems) <= 1:
        print("-" * 100)
        print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} Crawling is finished at LastPage {pageNo - 1}, Thank You!!")
        break

    for li in boxitems:
        # The article number is also used as the articleid of the detail link.
        article_no = li.select_one("div.inner_number").text
        df.loc[index_no] = {
            "No": article_no,
            "날짜": li.select_one("td.td_date").text,
            "제목": li.select_one("a.article").text.replace("\n", "").replace("\t", "").strip(),
            "댓글수": _cell_text(li, "a.cmt > em"),
            "조회수": _cell_text(li, "td.td_view"),
            "좋아요": _cell_text(li, "td.td_likes"),
            "사진": _cell_text(li, "span.list-i-img"),
            "글쓴이": li.select_one("td.td_name").text.replace("\n", ""),
            "본문링크": f"https://cafe.naver.com/ArticleRead.nhn?clubid={cafeid}&page={pageNo}&menuid={menuid}&boardtype=L&articleid={article_no}&referrerAllArticles=false",
        }
        index_no += 1

    # Pause between pages to avoid hammering the server.
    time.sleep(0.6)
    print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} Url Request Success, PageNo is {pageNo}")
    pageNo += 1

# Save the crawled rows to an Excel file named after the board and the
# current timestamp.
# Fall back to the raw board number when it is not listed in match_box, so an
# unlisted board cannot discard the crawl with a KeyError.
board_name = match_box.get(str(menuid), str(menuid))
# Create the output directory up front; otherwise a missing "data" folder
# would make the export fail after the whole crawl has already run.
out_dir = Path("data")
out_dir.mkdir(parents=True, exist_ok=True)
df.to_excel(out_dir / f"파우더룸 {board_name} 크롤링_{dt.now().strftime('%Y%m%d_%Hh%Mm')}.xlsx")

+ Recent posts