## for https://stackoverflow.com/q/74650912/6146136

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas

## PASTE getDECstars FROM https://pastebin.com/Q0GLwRv9 ## without selenium
## PASTE [some version of] linkToSoup FROM https://pastebin.com/rBTr06vy ## without selenium


def login_to_gd(driver, tmout=10, lEmail='YOUR_EMAIL', lPwd='YOUR_PASSWORD'):
    # not needed if you want to login manually
    try:
        # fill in the email field and submit
        WebDriverWait(driver, tmout).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'input#inlineUserEmail')))
        uemInp = driver.find_element(By.CSS_SELECTOR, 'input#inlineUserEmail')
        driver.execute_script("arguments[0].click();", uemInp)
        uemInp.send_keys(lEmail, Keys.ENTER)

        # then the password field
        WebDriverWait(driver, tmout).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'input#inlineUserPassword')))
        pwdInp = driver.find_element(By.CSS_SELECTOR, 'input#inlineUserPassword')
        driver.execute_script("arguments[0].click();", pwdInp)
        pwdInp.send_keys(lPwd, Keys.ENTER)

        # wait until the logged-in profile container shows up
        WebDriverWait(driver, tmout).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div[data-test="profile-container"]')))
    except Exception as e:
        print(e)
        input('Please login manually and then press ENTER here')


def formCssKey(scEl):
    # build a sorted, space-separated key from an element's css-* classes
    # [scEl can be a selenium WebElement or a list of classes from bs4]
    if type(scEl) == list:
        c = [str(s) for s in scEl]
    else:
        c = str(scEl.get_attribute('class')).split()
    return ' '.join(sorted(w for w in c if w.startswith('css-')))


def cssToStars(cStr, outOf=5):
    # the star bar's fill is a linear-gradient; its first color-stop percentage
    # encodes the rating, e.g. '...rgb(12, 170, 65) 80%, rgb(222, 224, 227)...'
    # means 80% filled --> 4.0 of 5 stars
    try:
        str_bfr, str_aft = 'linear-gradient(90deg, rgb(12, 170, 65) ', '%, rgb(222, 224, 227) '
        perc = float(cStr.split(str_bfr, 1)[1].split(str_aft)[0])
        if type(outOf) == int and outOf > 0:
            perc = (perc / 100) * outOf
        return float(f'{perc:.3}')
    except:
        return None


def linkToSoup_selenium(driver, tmout=10, isv=False, returnErr=False):
    # wait for reviews to render, collect the sub-rating bars' backgrounds
    # (only selenium can read the computed css), then soup the page source
    try:
        WebDriverWait(driver, tmout).until(EC.visibility_of_all_elements_located(
            (By.CSS_SELECTOR, 'li[id^="empReview_"]')))
        subRatSel = 'li[id^="empReview_"] div:has(> .ratingNumber) ~ aside ul > li div:nth-of-type(2)'
        starConts = driver.find_elements(By.CSS_SELECTOR, subRatSel)
        starConts = {
            formCssKey(s): s.value_of_css_property('background')
            for s in starConts
        }
        lSoup = BeautifulSoup(driver.page_source, 'html.parser')
        return lSoup, starConts
    except Exception as e:
        if isv:
            print(e)
        return (str(e) if returnErr else None), {}
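
## NOTE: the pastebin linkToSoup is not reproduced here; below is only a rough,
## ASSUMED minimal stand-in (plain requests, no selenium) matching how it is
## called in the commented-out "## without selenium" line further down --
## paste the real version from the link above rather than relying on this:
# import requests
# def linkToSoup(targetUrl, isv=False, returnErr=False):
#     try:
#         r = requests.get(targetUrl, headers={'User-Agent': 'Mozilla/5.0'})
#         r.raise_for_status()  # surface 4xx/5xx responses as exceptions
#         return BeautifulSoup(r.content, 'html.parser')
#     except Exception as e:
#         if isv: print(repr(e), 'from', targetUrl)
#         return str(e) if returnErr else None  # error string, like the selenium version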

def scrape_gdRevs(pgUrl, csvFn='empRevs.csv', constBreak=5, breaktime=5, maxScrapes=500):
    # resume from a previous run if csvFn already exists
    try:
        prevDf = pandas.read_csv(csvFn)
        prevs = list(prevDf['reviewId'])
        empRevs = prevDf.to_dict('records')
    except:
        prevs, empRevs = [], []
    total_allTime = len(empRevs)
    total_current = 0

    ### JUST FOR STATS ###
    try:
        scrapeLogs = pandas.read_csv(f'scrapeLogs_{csvFn}').to_dict('records')
    except:
        scrapeLogs = []
    ######################

    totalRevs = 'UNKNOWN'
    pcCon = 'div.px-std:has(h2 > a.reviewLink) + div.px-std'
    pcDiv = f'{pcCon} div.v2__EIReviewDetailsV2__fullWidth'
    refDict = {
        'rating_num': 'span.ratingNumber',
        'emp_status': 'div:has(> div > span.ratingNumber) + span',
        'header': 'h2 > a.reviewLink',
        'subheader': 'h2:has(> a.reviewLink) + span',
        'pros': f'{pcDiv}:first-of-type > p.pb',
        'cons': f'{pcDiv}:nth-of-type(2) > p.pb'
    }

    # I copy chromedriver.exe to the same folder as this py file
    driverG = webdriver.Chrome()  ## for selenium
    driverG.get('https://www.glassdoor.com')  ## for selenium
    login_to_gd(driverG)  # REMOVE if you want to login manually ## for selenium
    # input('Please LogIn and then press enter here')  # manual login ## for selenium
    driverG.get(pgUrl)  ## for selenium

    subRatSel = 'div:has(> .ratingNumber) ~ aside ul > li:has(div ~ div)'
    pgftSel = 'div[data-test="pagination-footer-text"]'
    for sci in range(maxScrapes):
        scn = f'[{sci + 1} of {maxScrapes}]'
        print('', end=f'\r{scn} scraping {pgUrl}')
        # soup = linkToSoup(pgUrl, isv=True, returnErr=True)  ## without selenium
        soup, srDict = linkToSoup_selenium(driverG, isv=True, returnErr=True)  ## for selenium
        if type(soup) == str:  # got an error message instead of soup
            scrapeLogs.append({'scrapeNum': sci + 1, 'errorMsg': soup,
                               'url': pgUrl})  # JUST FOR STATS ###
            # break  # if you want to stop at first error
            # OR take a break:
            waitMsg = f'!{soup}! {breaktime*sci}s break before retrying'
            print('', end=f'\r{scn} {waitMsg} {pgUrl}')
            time.sleep(breaktime * sci)
            continue

        ### JUST FOR STATS ###
        try:
            curPg = soup.select_one('li a.page.selected').get_text().strip()
        except:
            curPg = 'UNKNOWN'
        if curPg.isdigit():
            curPg = int(curPg)
        try:
            ftrTxt = soup.select_one(pgftSel).get_text().strip()
        except:
            ftrTxt = 'reviewCount UNKNOWN'
        try:
            tRevs = ftrTxt.split('of')[-1].split()[0].replace(',', '')
        except:
            tRevs = 'UNKNOWN'
        if tRevs.isdigit():
            totalRevs = int(tRevs)
        print('', end=f'\r{scn} scraping "{ftrTxt}" from page#{curPg} {pgUrl}')
        ######################

        newRevIds, pgRevIds = [], []  # JUST FOR STATS ###
        rSoups = soup.select('li[id^="empReview_"]')
        for r in rSoups:
            rId = r.get('id')
            pgRevIds.append(rId)  # JUST FOR STATS ###
            if rId in prevs:
                continue  # skip duplicates
            newRevIds.append(rId)  # JUST FOR STATS ###

            rDet = {'reviewId': rId}
            # sub-ratings: label from the first div, stars from the second div's css
            for sr in r.select(subRatSel):
                k = sr.select_one('div:first-of-type').get_text(' ').strip()
                # sval = getDECstars(sr.select_one('div:nth-of-type(2)'), soup)  ## without selenium
                kc = formCssKey(sr.select_one('div:nth-of-type(2)').get('class', []))  ## for selenium
                sval = cssToStars(srDict[kc]) if kc in srDict else None  ## for selenium
                rDet[f'[rating] {k}'] = sval
            for k, sel in refDict.items():
                sval = r.select_one(sel)
                if sval:
                    sval = sval.get_text(' ').strip()
                rDet[k] = sval
            empRevs.append(rDet)
            prevs.append(rId)
        pandas.DataFrame(empRevs).to_csv(csvFn, index=False)
        total_current += len(newRevIds)
        total_allTime = len(empRevs)

        ### JUST FOR STATS ###
        for_sl = {
            'scrapeNum': sci + 1, 'curPg': curPg, 'totalRevs': tRevs,
            'pgFooter': ftrTxt, 'allCt': len(pgRevIds),
            'uniqCt': len(set(pgRevIds)), 'newCt': len(newRevIds),
            'allRevs': pgRevIds, 'newRevs': newRevIds, 'url': pgUrl
        }
        if not rSoups:
            for_sl['errorMsg'] = 'No reviews found in ' + ''.join([
                ' '.join(w for w in l.split() if w)
                for l in str(soup).splitlines() if l
            ])
        scrapeLogs.append(for_sl)
        pandas.DataFrame(scrapeLogs).to_csv(f'scrapeLogs_{csvFn}', index=False)
        ######################

        rCt = len(rSoups)
        print(f'\r{scn} scraped {rCt} "{ftrTxt}" from page#{curPg} {pgUrl}')

        # nextPg = soup.select_one('li:has(a.page.selected) + li a.page[href]')  ## without selenium
        nextPg = driverG.find_elements(By.CSS_SELECTOR,
                                       'li:has(a.page.selected) + li a.page[href]')  ## for selenium
        if nextPg:
            # pgUrl = 'https://www.glassdoor.com/' + nextPg.get('href')  ## without selenium
            if constBreak > 0:
                time.sleep(constBreak)
            scrollElToBottom = 'arguments[0].scrollIntoView(false);'  ## for selenium
            driverG.execute_script(scrollElToBottom, nextPg[0])  ## for selenium
            nextPg[0].click()  ## for selenium
        elif not rSoups:  # remove if you want to stop at first error
            print('', end=f'\r{scn} {breaktime*sci}s break before retrying {pgUrl}')
            time.sleep(breaktime * sci)
        else:
            break  # last page

    driverG.quit()  # quit() [rather than close()] also shuts down chromedriver ## for selenium
    del driverG  # (just in case) ## for selenium

    print('\n\n\ntotal reviews:', totalRevs)
    print('total reviews scraped this run:', total_current)
    print('total reviews scraped over all time:', total_allTime)
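
## note: since scrape_gdRevs re-reads csvFn at startup and skips reviewIds that
## were already saved, re-running the same call (like the example below) resumes
## an interrupted run instead of duplicating rows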
# startUrl = 'https://www.glassdoor.com/Reviews/Walmart-Reviews-E715.htm?filter.iso3Language=eng'
# scrape_gdRevs(startUrl)
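
## a quick, ASSUMED way to inspect the saved csv afterwards (the '[rating] ...'
## column names depend on whatever sub-rating labels the page actually shows):
# df = pandas.read_csv('empRevs.csv')
# print(pandas.to_numeric(df['rating_num'], errors='coerce').describe())  # overall ratings
# ratingCols = [c for c in df.columns if c.startswith('[rating]')]
# print(df[ratingCols].apply(lambda c: pandas.to_numeric(c, errors='coerce')).mean())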