1. 검색 키워드를 가지고 블로그 게시글들을 크롤링
- 시작 페이지와 끝 페이지를 지정
- Pandas로 Dataframe화 시켜서 Excel로 추출
# 1. Crawl Naver blog posts for a search keyword.
#    - Specify a start page and an end page (end_page == -1 means crawl
#      until the API stops returning items).
#    - Collect the results into a pandas DataFrame and export to Excel.
import time
import datetime
from datetime import datetime as dt

import pandas as pd


def resolve_relative_date(label):
    """Normalize Naver's relative date labels to an absolute 'YYYY.MM.DD'.

    Handles 'N시간 전' (N hours ago -> today), '어제' (yesterday) and
    'N일 전' (N days ago, any number of digits). Anything else (already
    an absolute date string) is returned unchanged.
    """
    today = datetime.date.today()
    if label.endswith("시간 전"):
        return today.strftime("%Y.%m.%d")
    if label == "어제":
        return (today - datetime.timedelta(days=1)).strftime("%Y.%m.%d")
    if label.endswith("일 전"):
        # timedelta arithmetic stays correct across month/year boundaries
        # (the original 'today().day - N' went negative at month start),
        # and slicing off the suffix handles '10일 전' and beyond, not
        # just a single leading digit.
        days_ago = int(label[: -len("일 전")].strip())
        return (today - datetime.timedelta(days=days_ago)).strftime("%Y.%m.%d")
    return label


def crawl_blog(keyword, start_page=1, end_page=3, delay=0.5):
    """Crawl Naver blog search results for *keyword*.

    Pages run from *start_page* to *end_page* inclusive; pass
    end_page=-1 to keep crawling until the API returns no more items.
    Returns a DataFrame with one row per post
    (제목/링크/작성일/작성자/썸네일/첨부사진수/상세내용).
    """
    # Imported here so the module itself can be imported (e.g. to reuse
    # resolve_relative_date) without these third-party packages.
    import requests
    from bs4 import BeautifulSoup

    rows = []
    page_no = start_page
    while end_page == -1 or page_no <= end_page:
        page_index = (page_no - 1) * 30 + 1  # blog API pages hold 30 items
        url = (
            'https://s.search.naver.com/p/blog/search.naver?where=blog&sm=tab_pge'
            f'&api_type=1&query={keyword}&rev=44&start={page_index}'
            '&dup_remove=1&post_blogurl=&post_blogurl_without=&nso=&nlu_query='
            '{"r_category":"29"}'
            f'&dkey=0&source_query=&nx_search_query={keyword}'
            '&spq=0&_callback=viewMoreContents'
        )
        res = requests.get(url)
        # The JSONP payload escapes markup with backslashes; strip them
        # before parsing. Explicit parser avoids bs4's "no parser" warning.
        html = BeautifulSoup(res.text.replace("\\", ""), "html.parser")
        items = html.select("li")
        if len(items) <= 1:  # only the placeholder <li> left -> no more results
            print("-" * 100)
            print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} "
                  f"Crawling is finished at LastPage {page_no - 1}, Thank You!!")
            break
        for li in items:
            date_node = li.select_one("span.sub_time")
            title_node = li.select_one("a.api_txt_lines")
            if date_node is None or title_node is None:
                continue  # non-post <li> (layout markup) -> skip instead of crashing
            author = li.select_one("a.sub_txt")
            body = li.select_one("div.api_txt_lines")
            thumb = li.select_one("img.thumb")
            count = li.select_one("span.thumb_count")
            rows.append({
                '제목': title_node.text,
                '링크': title_node["href"],
                '작성일': resolve_relative_date(date_node.text),
                '작성자': author.text if author is not None else "",
                # Posts without a thumbnail/photo-count no longer raise.
                '썸네일': thumb["src"] if thumb is not None else "",
                '첨부사진수': count.text if count is not None else "",
                '상세내용': body.text.strip() if body is not None else "",
            })
        print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} "
              f"Url Request Success, PageNo is {page_no}")
        page_no += 1
        time.sleep(delay)  # be polite to the server between page requests
    return pd.DataFrame(rows, columns=["제목", "링크", "작성일", "작성자",
                                       "썸네일", "첨부사진수", "상세내용"])


if __name__ == "__main__":
    keyword = "수원 맛집"
    df = crawl_blog(keyword, start_page=1, end_page=3)
    # NOTE(review): assumes a 'data/' directory already exists — confirm.
    df.to_excel(f"data/블로그 {keyword} 크롤링_{dt.now().strftime('%Y%m%d_%Hh%Mm')}.xlsx")
2. 네이버 카페 검색어 크롤링
- 시작페이지와 끝페이지 지정
- 제목, 링크, 작성일, 카페명, 상세내용 크롤링
import time
import requests
from bs4 import BeautifulSoup
import datetime
from datetime import datetime as dt
import pandas as pd
# 2. Crawl Naver cafe search results for a keyword.
#    - Specify a start page and an end page (end_page == -1 means crawl
#      until the API stops returning items).
#    - Collect title, link, date, cafe name and snippet into a DataFrame
#      and export to Excel.
def resolve_relative_date(label):
    """Normalize Naver's relative date labels to an absolute 'YYYY.MM.DD'.

    Handles 'N시간 전' (N hours ago -> today), '어제' (yesterday) and
    'N일 전' (N days ago, any number of digits). Anything else (already
    an absolute date string) is returned unchanged.
    """
    today = datetime.date.today()
    if label.endswith("시간 전"):
        return today.strftime("%Y.%m.%d")
    if label == "어제":
        return (today - datetime.timedelta(days=1)).strftime("%Y.%m.%d")
    if label.endswith("일 전"):
        # timedelta arithmetic stays correct across month/year boundaries
        # (the original 'today().day - N' went negative at month start),
        # and slicing off the suffix handles '10일 전' and beyond, not
        # just a single leading digit.
        days_ago = int(label[: -len("일 전")].strip())
        return (today - datetime.timedelta(days=days_ago)).strftime("%Y.%m.%d")
    return label


def crawl_cafe(keyword, start_page=1, end_page=3, delay=0.5):
    """Crawl Naver cafe article search results for *keyword*.

    Pages run from *start_page* to *end_page* inclusive; pass
    end_page=-1 to keep crawling until the API returns no more items.
    Returns a DataFrame with one row per article
    (제목/링크/작성일/카페명/상세내용).
    """
    # Imported here so the module itself can be imported (e.g. to reuse
    # resolve_relative_date) without these third-party packages.
    import requests
    from bs4 import BeautifulSoup

    rows = []
    page_no = start_page
    while end_page == -1 or page_no <= end_page:
        page_index = (page_no - 1) * 10 + 1  # cafe API pages hold 10 items
        url = (
            'https://s.search.naver.com/p/cafe/search.naver?where=article&ie=utf8'
            f'&query={keyword}&prdtype=0&t=0&st=rel&srchby=text&dup_remove=1'
            '&cafe_url=&without_cafe_url=&sm=tab_opt&nso_open=0&rev=44&abuse=0'
            '&ac=0&aq=0&converted=0&is_dst=0&nqx_context=&nx_and_query='
            '&nx_search_hlquery=&nx_search_query=&nx_sub_query=&people_sql=0'
            f'&spq=0&x_tab_article=&is_person=0&start={page_index}&display=10'
            '&prmore=1&_callback=viewMoreContents'
        )
        res = requests.get(url)
        # The JSONP payload escapes markup with backslashes; strip them
        # before parsing. Explicit parser avoids bs4's "no parser" warning.
        html = BeautifulSoup(res.text.replace("\\", ""), "html.parser")
        items = html.select("li")
        if len(items) <= 1:  # only the placeholder <li> left -> no more results
            print("-" * 100)
            print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} "
                  f"Crawling is finished at LastPage {page_no - 1}, Thank You!!")
            break
        for li in items:
            date_node = li.select_one("span.sub_time")
            title_node = li.select_one("a.api_txt_lines")
            if date_node is None or title_node is None:
                continue  # non-post <li> (layout markup) -> skip instead of crashing
            cafe_node = li.select_one("a.sub_txt")
            body_node = li.select_one("div.api_txt_lines")
            rows.append({
                '제목': title_node.text,
                '링크': title_node["href"],
                '작성일': resolve_relative_date(date_node.text),
                '카페명': cafe_node.text if cafe_node is not None else "",
                '상세내용': body_node.text.strip() if body_node is not None else "",
            })
        print(f"{dt.now().strftime('%Y-%m-%d %H:%M:%S')} "
              f"Url Request Success, PageNo is {page_no}")
        page_no += 1
        time.sleep(delay)  # be polite to the server between page requests
    return pd.DataFrame(rows, columns=["제목", "링크", "작성일", "카페명", "상세내용"])


if __name__ == "__main__":
    keyword = "기초 스킨케어"
    df = crawl_cafe(keyword, start_page=1, end_page=3)
    # Bug fix: the original filename said '블로그' (blog) — a copy-paste
    # leftover from section 1; this crawl is for cafes ('카페').
    # NOTE(review): assumes a 'data/' directory already exists — confirm.
    df.to_excel(f"data/카페 {keyword} 크롤링_{dt.now().strftime('%Y%m%d_%Hh%Mm')}.xlsx")