# 1. Crawl the contents of a specific Naver Cafe board
#  - Requires the cafe ID and the board (menu) number
#  - Crawls the entire article list of the given board
import pandas as pd
import requests
import time
from datetime import datetime as dt
from bs4 import BeautifulSoup
# Request headers: mimic a desktop Chrome browser so Naver serves the
# normal HTML list view instead of rejecting the scraper.
my_header = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
}
# Accumulator for scraped rows; one row per article, written via df.loc.
df = pd.DataFrame(columns=["No", "날짜", "제목", "댓글수", "조회수", "좋아요", "사진", "글쓴이", "본문링크"])
index_no = 0  # next row index to write into df
pageNo = 614  # first list page to fetch
end_page = -1 # -1 means crawl all the way to the end
cafeid= 10050813 # "Powder Room" cafe ID
menuid = 42 # board number — see the dictionary below
# Maps board (menu) id -> board name; used to build the output filename.
match_box = {"51": "피부케어팁", "92": "스킨케어Q&A", "31": "스킨케어리뷰", "39": "클렌징리뷰", "40": "트러블케어리뷰", "42": "선케어리뷰"}
# Crawl the board's list view page by page until either the configured
# end_page is reached or the board runs out of article rows, collecting
# one df row per article; finally save everything to an Excel file.
while True:
    # List-view URL for the current page. The original hard-coded
    # search.cafeId=10050813; use the cafeid variable for consistency.
    url = f"https://cafe.naver.com/ArticleList.nhn?search.clubid={cafeid}&search.menuid={menuid}&search.boardtype=L&search.totalCount=151&search.cafeId={cafeid}&search.page={pageNo}"
    res = requests.get(url, headers=my_header)
    html = BeautifulSoup(res.text, "html.parser")

    # Article rows are the <tr> elements with neither id nor class that
    # contain a td.td_article cell (filters out notice/ad rows).
    boxitems = [
        tr for tr in html.find_all("tr", {"id": None, "class": None})
        if tr.select("td.td_article")
    ]

    if end_page + 1 == pageNo:
        # Stop at the user-requested page (never fires when end_page == -1,
        # i.e. "crawl everything").
        print("-"*100)
        print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} Crawling is finished at PageNo {pageNo-1}, Thank You!!")
        break

    if len(boxitems) <= 1:
        # One row or fewer: we have run past the board's real last page.
        # NOTE(review): a final page holding exactly one article would be
        # skipped by this check — confirm that is intentional.
        print("-" * 100)
        print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} Crawling is finished at LastPage {pageNo - 1}, Thank You!!")
        break

    for li in boxitems:
        # Optional cells default to 0 when the element is absent.
        reply_el = li.select_one("a.cmt > em")
        reply = reply_el.text if reply_el else 0
        pictures_el = li.select_one("span.list-i-img")
        pictures = pictures_el.text if pictures_el else 0
        likey_el = li.select_one("td.td_likes")
        likey = likey_el.text if likey_el else 0
        views_el = li.select_one("td.td_view")
        views = views_el.text if views_el else 0

        # Article number doubles as the read-view URL parameter.
        article_no = li.select_one("div.inner_number").text
        df.loc[index_no] = {
            "No": article_no,
            "날짜": li.select_one("td.td_date").text,
            "제목": li.select_one("a.article").text.replace("\n", "").replace("\t", "").strip(),
            "댓글수": reply,
            "조회수": views,
            "좋아요": likey,
            "사진": pictures,
            "글쓴이": li.select_one("td.td_name").text.replace("\n", ""),
            "본문링크": f"https://cafe.naver.com/ArticleRead.nhn?clubid={cafeid}&page={pageNo}&menuid={menuid}&boardtype=L&articleid={article_no}&referrerAllArticles=false",
        }
        index_no += 1

    # Throttle between page requests to be polite to the server.
    time.sleep(0.6)
    print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} Url Request Success, PageNo is {pageNo}")
    pageNo += 1

# Persist all collected rows; filename embeds the board name and a timestamp.
df.to_excel("data/파우더룸 " + match_box[str(menuid)] + f" 크롤링_{dt.now().strftime('%Y%m%d_%Hh%Mm')}.xlsx")