From 41f3735787ab12815cb9a692ef506f1afe630b9b Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Wed, 20 Sep 2023 23:27:35 -0700 Subject: [PATCH] finished fully running completed Altenens and Cardingleaks --- Forums/Altenens/crawler_selenium.py | 26 ++++++++-------- Forums/Altenens/parser.py | 6 ++-- Forums/Cardingleaks/crawler_selenium.py | 22 ++++++++------ Forums/Cardingleaks/parser.py | 18 +++++++---- Forums/OnniForums/crawler_selenium.py | 40 ++++++++++++------------- 5 files changed, 62 insertions(+), 50 deletions(-) diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py index d13847a..c9c4907 100644 --- a/Forums/Altenens/crawler_selenium.py +++ b/Forums/Altenens/crawler_selenium.py @@ -227,9 +227,9 @@ def crawlForum(driver): driver.refresh() savePage(driver, driver.page_source, topic + f"page{counter}") # very important - # comment out - if counter == 2: - break + # # comment out + # if counter == 2: + # break try: page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') @@ -240,15 +240,17 @@ def crawlForum(driver): except NoSuchElementException: has_next_topic_page = False - for j in range(counter): - driver.back() - - # comment out - break - - # comment out - if count == 1: - break + try: + driver.get(link) + except: + driver.refresh() + + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py index 8ea1fe0..e056cb2 100644 --- a/Forums/Altenens/parser.py +++ b/Forums/Altenens/parser.py @@ -25,8 +25,10 @@ def altenens_description_parser(soup): image_user = [] # 9 all user avatars of each post image_post = [] # 10 all first images of each post - topic = soup.find("h1", {"class": "p-title-value"}).text - topic = cleanString(topic.strip()) + etopic = soup.find("h1", {"class": "p-title-value"}) + if etopic is not None: + topic = etopic.text + topic = cleanString(topic.strip()) body = soup.find('div', {"class": "block-container lbContainer"}) iposts = body.find_all('article', {"class": "message message--post js-post js-inlineModContainer"}) diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py index 1e89751..97296ca 100644 --- a/Forums/Cardingleaks/crawler_selenium.py +++ b/Forums/Cardingleaks/crawler_selenium.py @@ -230,12 +230,16 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() - savePage(driver, driver.page_source, topic + f"page{counter}") # very important - # comment out - if counter == 2: + if isListingLink(driver.current_url): break + savePage(driver, driver.page_source, topic + f"page{counter}") # very important + + # # comment out + # if counter == 2: + # break + try: page = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') if page == "": @@ -248,12 +252,12 @@ def crawlForum(driver): for j in range(counter): driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') diff --git a/Forums/Cardingleaks/parser.py b/Forums/Cardingleaks/parser.py index f913243..a2da87b 100644 --- a/Forums/Cardingleaks/parser.py +++ b/Forums/Cardingleaks/parser.py @@ -109,8 +109,11 @@ def cardingleaks_listing_parser(soup: Tag): li = soup.find("h1", {"class": "p-title-value"}) board = cleanString(li.text.strip()) - thread_list = soup.find('div', {"class": "structItemContainer-group structItemContainer-group--sticky"}).find_all('div', {"data-author": True}) + \ - soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True}) + thread_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True}) + + sticky = soup.find('div', {"class": "structItemContainer-group structItemContainer-group--sticky"}) + if sticky is not None: + thread_list = sticky.find_all("div", {"data-author": True}) + thread_list nm = len(thread_list) @@ -123,9 +126,12 @@ def cardingleaks_listing_parser(soup: Tag): author_icon = thread.find("a", {"class": "avatar avatar--s"}) if author_icon is not None: - author_icon = author_icon.find('img').get('src') - author_icon = author_icon.split('base64,')[-1] - image_user.append(author_icon) + author_icon = author_icon.find('img') + if author_icon is not None: + author_icon = author_icon.get('src').split('base64,')[-1] + image_user.append(author_icon) + else: + image_user.append('-1') else: image_user.append('-1') @@ -158,4 +164,4 @@ def cardingleaks_links_parser(soup): href.append(link) - return href + return [href[-1]] diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py index 5a570b5..84aeb3e 100644 --- a/Forums/OnniForums/crawler_selenium.py +++ b/Forums/OnniForums/crawler_selenium.py @@ -177,24 +177,24 @@ def getInterestedLinks(): # Hacking & Cracking tutorials links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials') - # Hacking & Cracking questions - links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions') - # Exploit PoCs - links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs') - # sellers - links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Sellers') - # buyers questions - links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Buyers-Questions') - # combo lists - links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Combo-lists') - # Malware-development - links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Malware-development') - # coding - links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Coding') - # Carding & Fraud - links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Carding-Fraud') - # OPSEC - links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-OPSEC--13') + # # Hacking & Cracking questions + # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions') + # # Exploit PoCs + # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs') + # # sellers + # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Sellers') + # # buyers questions + # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Buyers-Questions') + # # combo lists + # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Combo-lists') + # # Malware-development + # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Malware-development') + # # coding + # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Coding') + # # Carding & Fraud + # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Carding-Fraud') + # # OPSEC + # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-OPSEC--13') return links @@ -249,9 +249,7 @@ def crawlForum(driver): except NoSuchElementException: has_next_topic_page = False - for j in range(counter): - driver.back() - + # making sure we go back to the listing page (browser back button simulation) try: driver.get(link) except: