# Monday, July 20, 2015
# Scraping script with Selenium
# Scrape the 'mainTable' table from consecutive result pages of admitere.edu.ro
# and append each parsed table to a single HTML file.
#
# Dependencies: RSelenium drives the browser, XML parses the scraped markup,
# and R2HTML appends the parsed tables to the output file.
library("RSelenium")
library("R2HTML")
library("XML")

# NOTE(review): checkForServer()/startServer() are presumably unavailable in
# newer RSelenium releases -- confirm against the installed version.
checkForServer() # search for and download Selenium Server java binary. Only need to run once.
startServer() # run Selenium Server binary
remDr <- remoteDriver(browserName = "firefox", port = 4444) # instantiate remote driver to connect to Selenium Server

# Open the browser
remDr$open(silent = TRUE)

# Navigate to the site
remDr$navigate("http://admitere.edu.ro/Pages/CandInJud.aspx?jud=4&alfa=2")

# Choose from the dropdown the page from which the iteration starts
option <- remDr$findElement(using = "xpath", "//*/option[@value = '440']")
option$clickElement()

# Iterate over the pages and save the HTML of each table
n <- 180 # number of pages to save, counted from the page selected above
output_file <- "admitereLiceu2015_4.html"
master <- character(n) # raw outerHTML of the table found on each page

for (i in seq_len(n)) {
  table_elem <- remDr$findElement(
    using = "xpath",
    "//*/table[@class = 'mainTable']"
  )
  table_html <- table_elem$getElementAttribute("outerHTML")[[1]]
  master[i] <- table_html

  # asText = TRUE: the input is markup held in memory, not a file path or URL.
  table_doc <- htmlParse(table_html, asText = TRUE, encoding = "UTF-8")
  HTML(table_doc, file = output_file, append = TRUE)

  # Only move on while there are still pages left to save; clicking after the
  # last saved page would just cost an extra request and a 3 second wait.
  if (i < n) {
    # Presumably the "next page" control of the pager -- verify on the site.
    pager_button <- remDr$findElement(
      using = "name",
      value = "ctl00$ContentPlaceHolderBody$ImageButtonDR1"
    )
    pager_button$clickElement()
    Sys.sleep(3) # pause after the click, presumably so the next page can load
  }
}
head(master)
# ===================== Sandbox ====
# Scratch area: snippets meant to be run interactively against the `remDr`
# session created by the main script.
library("XML") # needed by readHTMLTable(), htmlParse(), xpathApply() below

# Click
webElem <- remDr$findElement(using = "name", value = "ctl00$ContentPlaceHolderBody$ImageButtonDR1")
webElem$clickElement()

# NOTE(review): open() presumably starts a fresh browser session, in which case
# the page has to be navigated to again before the lookup below can succeed.
remDr$open(silent = TRUE) # open web browser

webElem <- remDr$findElement(using = "xpath", "//*/table[@class = 'mainTable']")
tableHTML <- webElem$getElementAttribute("outerHTML")[[1]]
# Read the attribute while the browser is still on the page that owns webElem;
# the loop below navigates elsewhere, after which this element is no longer
# part of the current page.
appData <- webElem$getElementAttribute("outerHTML")[[1]]
# readHTMLTable() needs markup (or a parsed document), not the Selenium
# element handle, so parse the extracted outerHTML first.
foo <- readHTMLTable(
  htmlParse(tableHTML, asText = TRUE, encoding = "UTF-8"),
  which = 1
)

# Collect the fund names/tickers from the Fidelity fund screener, page by page.
n <- 5 # number of pages to scrape. 80 pages in total. I just scraped 5 pages for this example.
base_url <- "https://www.fidelity.com/fund-screener/evaluator.shtml#!&ntf=N&ft=BAL_all&msrV=advanced&sortBy=FUND_MST_MSTAR_CTGY_NM&pgNo="
page_funds <- vector("list", n) # one slot per page; combined after the loop

for (i in seq_len(n)) {
  site <- paste0(base_url, i) # create URL for each page to scrape
  remDr$navigate(site) # navigates to webpage
  elem <- remDr$findElement(using = "id", value = "tbody") # get big table in text string
  elem$highlightElement() # just for interactive use in browser. not necessary.
  elemtxt <- elem$getElementAttribute("outerHTML")[[1]] # gets us the HTML
  elemxml <- htmlTreeParse(elemtxt, useInternalNodes = TRUE) # parse string into HTML tree to allow for querying with XPath
  # Take the 'title' attribute of every <input> that has one.
  page_funds[[i]] <- unlist(
    xpathApply(elemxml, "//input[@title]", xmlGetAttr, "title")
  )
}
master <- unlist(page_funds) # append fund lists from each page together
head(master)
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment