R-Studio Web Crawling 하기

R-Studio의 rvest패키지를 이용해 크롤링 하기.

Featured image

사용 예시 소스코드

library("rvest")
url <- 'https://comic.naver.com/webtoon/weekday.nhn'
html <- read_html(url)

# thumb class로 찾아 들어감
div_lists <- html_nodes(html, ".thumb")
div_lists

atag <- html_nodes(div_lists, 'a')
atag

# tag그 안에 href 태그 찾아들어가기.
attr <- html_attr(atag, "href")
attr

# 데이터 거르기
attr <- gsub("/webtoon/list.nhn", "", attr)
attr <- gsub("\\?", "", attr)
attr

############################################################### 세부 자르기
# library(stringr)
# del <- '&'
# a <- str_split(attr[1], del, simplify = T)
# a
# 
# # title id값
# del01 <- '='
# id <- str_split(a[1], del01, simplify = T)
# id
# 
# # title weekday value
# data <- str_split(a[2], del01, simplify = T)
# data
#############################################################

# 제목 가져오기
myTitle <- html %>% html_nodes(".thumb") %>% html_nodes("img") %>% html_attr("title")
myTitle

# 주소가져오기
mySrc <- html %>% html_nodes(".thumb") %>% html_nodes("img") %>% html_attr("src")
mySrc

# 반복할 전체 갯수
it <- c(1:length(attr))
it

df <- data.frame()
for (idx in it) {
  del <- '&'
  mydata <- str_split(attr[idx], del, simplify = T) # &로 자르기
  
  del <- '='
  title_id <- str_split(mydata[1], del, simplify = T) # &로 자르기
  weekday <- str_split(mydata[2], del, simplify = T) # &로 자르기
  
  temp <- data.frame(title_id[2], weekday[2], myTitle[idx], mySrc[idx])
  df <- rbind(df, temp)
}
df

colnames(df) <- c("titleid", "weekday", "myTitle", "mySrc")
write.csv(df, "naver_cartoons.csv", row.names = F, quote = T)

#########################################################################

library(dplyr)
library(stringr)
library(rvest)

chKType <- function(val) {
  cat("자료형 : ", mode(val), "\n")
  cat("클래스 : ", class(val), "\n")
  cat("=====================================================\n")
  cat("구조 : ", str(val))
}

url <- 'https://www.bobaedream.co.kr/cyber/CyberCar.php?cat=2'
html <- read_html(url)
html

# thumb class로 찾아 들어감
# div_lists <- html_nodes(html, ".thumb")
div_lists <- html_nodes(html, "#listCont")
div_lists

thu <- html_nodes(div_lists, ".thumb")
thu

atag <- html_nodes(thu, 'a')
atag

# tag그 안에 href 태그 찾아들어가기.
attr <- html_attr(atag, "href")
attr

# 데이터 거르기
attr <- gsub("/cyber/CyberCar_view.php", "", attr)
attr <- gsub("\\?", "", attr)
attr

# title 가져오기
myTitle <- html %>% html_nodes(".thumb") %>% html_nodes("img") %>% html_attr("alt")
myTitle
# 그림 주소가져오기
mySrcimg <- html %>% html_nodes(".thumb") %>% html_nodes("img") %>% html_attr("src")
mySrcimg
# 연식 가져오기
carYear <- div_lists %>% html_nodes(".year") %>% html_text()
carYear <- gsub("\\r", "", carYear)
carYear <- gsub("\\n", "", carYear)
carYear <- gsub("\\t", "", carYear)
carYear <- str_trim(carYear)
head(carYear, 5)

it <- c(1:length(attr))
it

df <- data.frame()
for (idx in it) {
  del <- '&'
  mydata <- str_split(attr[idx], del, simplify = T) # &로 자르기
  
  del <- '='
  title_id <- str_split(mydata[1], del, simplify = T) # &로 자르기
  gubun <- str_split(mydata[2], del, simplify = T) # &로 자르기
  
  temp <- data.frame(title_id[2], ifelse(gubun[2]=="I", "외산", "국산"), myTitle[idx], mySrcimg[idx], carYear[idx])
  df <- rbind(df, temp)
}
df

colnames(df) <- c("titleid", "gubun", "myTitle", "mySrcimg", "연식")
write.csv(df, "car1.csv", row.names = F, quote = T)

# 스포츠카