Monday, July 20, 2015

Scraping Script with Selenium

# Scrape the paginated candidate table from admitere.edu.ro with RSelenium and
# append the table of every visited page to a local HTML file.
library("RSelenium")
library("R2HTML")
library("XML")

# NOTE(review): checkForServer() and startServer() are reported as
# deprecated/defunct in newer RSelenium releases (rsDriver() or an externally
# started Selenium server replace them) -- confirm against the installed
# version before relying on these two calls.
checkForServer() # search for and download the Selenium Server java binary; only needed once
startServer() # run the Selenium Server binary

# Instantiate the remote driver that connects to the Selenium Server.
remDr <- remoteDriver(browserName = "firefox", port = 4444)

# Open the web browser.
remDr$open(silent = TRUE)

# Load the site.
remDr$navigate("http://admitere.edu.ro/Pages/CandInJud.aspx?jud=4&alfa=2")

# Choose, from the drop-down, the page the iteration starts from.
option <- remDr$findElement(using = "xpath", "//*/option[@value = '440']")
option$clickElement()

# Iterate over the pages and save each table as HTML.
n <- 180 # number of pages to walk through, starting at the page selected above

for (i in seq_len(n)) {
  # Grab the table of the page currently shown in the browser.
  webElem <- remDr$findElement(
    using = "xpath",
    "//*/table[@class = 'mainTable']"
  )
  tableHTML <- webElem$getElementAttribute("outerHTML")[[1]]

  # asText = TRUE: the argument is markup, not a file name or URL.
  tableHTML <- htmlParse(tableHTML, asText = TRUE, encoding = "UTF-8")
  HTML(tableHTML, file = "admitereLiceu2015_4.html", append = TRUE)

  # Move on to the next page; stop cleanly when the button cannot be located
  # instead of aborting the script with a raw Selenium error.
  webElem1 <- tryCatch(
    remDr$findElement(
      using = "name",
      value = "ctl00$ContentPlaceHolderBody$ImageButtonDR1"
    ),
    error = function(e) {
      message("Stopping after page ", i, ": ", conditionMessage(e))
      NULL
    }
  )
  if (is.null(webElem1)) {
    break
  }
  webElem1$clickElement()
  Sys.sleep(3) # give the next page time to load
}

# ===================== Sand box ====
# Scratch snippets for interactive experimentation. They are guarded with
# if (FALSE) so that sourcing this script does not execute them; run the
# individual lines by hand when needed.
if (FALSE) {
  # Click the next-page button.
  webElem <- remDr$findElement(
    using = "name",
    value = "ctl00$ContentPlaceHolderBody$ImageButtonDR1"
  )
  webElem$clickElement()

  remDr$open(silent = TRUE) # open web browser

  webElem <- remDr$findElement(
    using = "xpath",
    "//*/table[@class = 'mainTable']"
  )
  tableHTML <- webElem$getElementAttribute("outerHTML")[[1]]
  # readHTMLTable() needs parsed markup, not the RSelenium webElement itself.
  foo <- readHTMLTable(htmlParse(tableHTML, asText = TRUE), which = 1)

  # Paginated Fidelity fund-screener example.
  n <- 5 # number of pages to scrape. 80 pages in total. I just scraped 5 pages for this example.
  fund_pages <- vector("list", n)
  for (i in seq_len(n)) {
    # Create the URL for each page to scrape.
    site <- paste0("https://www.fidelity.com/fund-screener/evaluator.shtml#!&ntf=N&ft=BAL_all&msrV=advanced&sortBy=FUND_MST_MSTAR_CTGY_NM&pgNo=", i)
    remDr$navigate(site) # navigate to the web page
    elem <- remDr$findElement(using = "id", value = "tbody") # the big table
    elem$highlightElement() # just for interactive use in browser. not necessary.
    elemtxt <- elem$getElementAttribute("outerHTML")[[1]] # the element's HTML
    # Parse the string into an HTML tree so it can be queried with XPath.
    elemxml <- htmlTreeParse(elemtxt, useInternalNodes = TRUE)
    # Extract the title attribute (fund name and ticker) of every input.
    fundList <- unlist(
      xpathApply(elemxml, "//input[@title]", xmlGetAttr, "title")
    )
    fund_pages[[i]] <- fundList
  }
  # Append the fund lists from each page together.
  master <- unlist(fund_pages)
  head(master)

  appData <- webElem$getElementAttribute("outerHTML")[[1]]
}

No comments:

Post a Comment