diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py
index 03f2367..5a570b5 100644
--- a/Forums/OnniForums/crawler_selenium.py
+++ b/Forums/OnniForums/crawler_selenium.py
@@ -49,14 +49,13 @@ def startCrawling():
 def login(driver):
     #click login button
     login_link = driver.find_element(
-        by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a').\
-        get_attribute('href')
+        by=By.XPATH, value='/html/body/div/div[1]/div[2]/div[1]/div/span/a[1]').get_attribute('href')
     driver.get(login_link)

     #entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
     #Username here
-    usernameBox.send_keys('purely_cabbage')
+    usernameBox.send_keys('cabbage_purely')
     passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
     #Password here
     passwordBox.send_keys('$ourP@tchK1ds')
@@ -176,26 +175,26 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []

-    # # Hacking & Cracking tutorials
+    # Hacking & Cracking tutorials
     links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials')
     # Hacking & Cracking questions
-    # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions')
-    # # Exploit PoCs
-    # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs')
-    # # Cracked software
-    # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Cracked-software')
-    # # Malware-development
-    # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Malware-development')
-    # # Carding & Fraud
-    # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Carding-Fraud')
-    # # Darknet Discussions
-    # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88')
-    # # OPSEC
-    # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-OPSEC')
-    # # Databases
-    # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Databases')
-    # # Proxies
-    # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Proxies')
+    links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions')
+    # Exploit PoCs
+    links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs')
+    # sellers
+    links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Sellers')
+    # buyers questions
+    links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Buyers-Questions')
+    # combo lists
+    links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Combo-lists')
+    # Malware-development
+    links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Malware-development')
+    # coding
+    links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Coding')
+    # Carding & Fraud
+    links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Carding-Fraud')
+    # OPSEC
+    links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-OPSEC--13')

     return links

@@ -235,13 +234,13 @@ def crawlForum(driver):
                     driver.refresh()
                 savePage(driver, driver.page_source, topic + f"page{counter}")  # very important

-                # comment out
-                if counter == 2:
-                    break
+                # # comment out
+                # if counter == 2:
+                #     break

                 try:
-                    temp = driver.find_element(By.XPATH,'/html/body/div/div[2]/div/div[3]/div')  # /html/body/div/div[2]/div/div[2]/div/
-                    page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')  # /html/body/div/div[2]/div/div[2]/div
+                    temp = driver.find_element(by=By.CLASS_NAME, value='float_left')
+                    page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')

                     if page == "":
                         raise NoSuchElementException
@@ -253,15 +252,20 @@ def crawlForum(driver):
                 for j in range(counter):
                     driver.back()

-                # comment out
-                # break
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()

-                # comment out
-                if count == 1:
-                    break
+                # # comment out
+                # break
+                #
+                # # comment out
+                # if count == 1:
+                #     break

             try:
-                temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[3]/div')  # /html/body/div/div[2]/div/div[3]/div
+                temp = driver.find_element(by=By.CLASS_NAME, value='float_left')
                 link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')

                 if link == "":
diff --git a/Forums/OnniForums/parser.py b/Forums/OnniForums/parser.py
index 98cfb68..72674b1 100644
--- a/Forums/OnniForums/parser.py
+++ b/Forums/OnniForums/parser.py
@@ -79,34 +79,35 @@ def onniForums_description_parser(soup: BeautifulSoup) -> tuple:

         # Append a "-1" to `feedbacks` array since they don't exists on this forum
         feedbacks.append("-1")
-
-
-        date_posted: str = topic.find("span", {"class": "post_date"}).text
-        date_posted_cleaned = cleanString(date_posted.split(",")[0])
-
-        today = datetime.now()
-
-        if date_posted_cleaned == 'Yesterday':
-            date_object = today - timedelta(days=1)
-
-        elif date_posted_cleaned.find('hour') > 0:
-            hours_ago = int(date_posted_cleaned.split(' ')[0])
-            date_object = today - timedelta(hours=hours_ago)
-
-        elif date_posted_cleaned.find('minute') > 0:
-            minutes_ago = int(date_posted_cleaned.split(' ')[0])
-            date_object = today - timedelta(minutes=minutes_ago)
-
+
+        date_posted = topic.find("span", {"class": "post_date"}).text.strip()
+        if 'modified' in date_posted:
+            date_posted = date_posted.split('(')[0].strip()
+
+        if 'Today' in date_posted or 'Yesterday' in date_posted:
+            day = topic.find("span", {"class": "post_date"}).find('span').get('title').strip()
+            time = date_posted.split(',')[1].strip()
+            date_posted = day + ', ' + time
+            date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p")
+
+        elif 'hour' in date_posted or 'minute' in date_posted:
+            date_posted = topic.find("span", {"class": "post_date"}).find('span').get('title').strip()
+            date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p")
+
         else:
-            date_object = datetime.strptime(date_posted_cleaned, "%m-%d-%Y")
+            date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p")

         addDates.append(date_object)

         image_post.append("-1")

-        img = topic.find('div', {"class": "author_avatar"}).find('img')
-        if img is not None:
-            img = img.get('src').split('base64,')[-1]
+        avatar = topic.find('div', {"class": "author_avatar"})
+        if avatar is not None:
+            img = avatar.find('img')
+            if img is not None:
+                img = img.get('src').split('base64,')[-1]
+            else:
+                img = '-1'
         else:
             img = "-1"
         image_user.append(img)
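The crawler hunks in `crawlForum` replace brittle absolute XPaths with class-based lookups for the "next page" link. Below is a minimal sketch of that lookup, assuming a Selenium 4 driver already on a listing or topic page; the helper name and the `None` return convention are mine, while the `float_left`/`pagination_next` class names and the empty-href check come from the diff:

```python
# Sketch only: mirrors the pagination lookup the patch switches to.
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


def next_page_href(driver):
    """Return the href of the 'next page' link, or None when there is none."""
    try:
        # locate the pagination container by class, then its 'next' anchor
        container = driver.find_element(by=By.CLASS_NAME, value='float_left')
        href = container.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
        # the crawler treats an empty href the same as a missing link
        return href or None
    except NoSuchElementException:
        return None
```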
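The parser hunk reworks post-date handling: MyBB's relative timestamps ("Today", "Yesterday", "n hours/minutes ago") are resolved through the absolute date stored in the nested `<span>`'s `title` attribute, and every branch is parsed with the single format `"%m-%d-%Y, %I:%M %p"`. A minimal sketch of that rule, condensed slightly and run against a made-up `post_date` snippet (the sample HTML below is illustrative, not captured from the forum):

```python
# Sketch of the date logic added to onniForums_description_parser.
from datetime import datetime

from bs4 import BeautifulSoup

# Hypothetical markup: relative day wrapped in a span whose title holds the real date.
sample = """
<span class="post_date"><span title="08-15-2023">Today</span>, 3:42 PM
 (This post was last modified: 08-15-2023, 4:01 PM)</span>
"""

post_date = BeautifulSoup(sample, 'html.parser').find("span", {"class": "post_date"})

date_posted = post_date.text.strip()
if 'modified' in date_posted:
    # drop the "(This post was last modified: ...)" suffix
    date_posted = date_posted.split('(')[0].strip()

if 'Today' in date_posted or 'Yesterday' in date_posted:
    # the nested span's title carries the absolute day; keep the visible time
    day = post_date.find('span').get('title').strip()
    time = date_posted.split(',')[1].strip()
    date_object = datetime.strptime(day + ', ' + time, "%m-%d-%Y, %I:%M %p")
elif 'hour' in date_posted or 'minute' in date_posted:
    # relative posts ("2 hours ago") carry the full timestamp in the title
    date_object = datetime.strptime(post_date.find('span').get('title').strip(), "%m-%d-%Y, %I:%M %p")
else:
    # absolute dates already read "MM-DD-YYYY, H:MM AM/PM"
    date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p")

print(date_object)  # 2023-08-15 15:42:00
```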