From 0345836e20edacb80377d28781bb29a55e7dcb82 Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Tue, 5 Sep 2023 17:59:33 -0700
Subject: [PATCH] user image tracking ONLY (missing post image) for some forums

---
 .idea/DW_Pipeline_Test.iml              |  1 +
 Forums/Altenens/parser.py               |  8 ++++
 Forums/BestCardingWorld/parser.py       |  5 ++
 Forums/Cardingleaks/parser.py           |  6 ++-
 Forums/CryptBB/parser.py                |  5 ++
 Forums/HiddenAnswers/parser.py          | 17 ++++++-
 Forums/OnniForums/parser.py             |  7 ++-
 Forums/Utilities/utilities.py           |  8 ++++
 MarketPlaces/Apocalypse/parser.py       |  7 +--
 MarketPlaces/GoFish/crawler_selenium.py | 28 +++--------
 MarketPlaces/Torzon/crawler_selenium.py | 62 +++++++++----------------
 11 files changed, 82 insertions(+), 72 deletions(-)

diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index f27dbb9..9ee2f4c 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -28,6 +28,7 @@

diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py
index 19155d5..bdad19d 100644
--- a/Forums/Altenens/parser.py
+++ b/Forums/Altenens/parser.py
@@ -22,6 +22,7 @@ def altenens_description_parser(soup):
     post = []       # 6 all messages of each post
     feedback = []   # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
     addDate = []    # 8 all dates of each post
+    image_user = []
 
     topic = soup.find("h1", {"class": "p-title-value"}).text
     topic = cleanString(topic.strip())
@@ -66,6 +67,13 @@ def altenens_description_parser(soup):
             date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
             addDate.append(date_time_obj)
 
+        img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img')
+        if img is not None:
+            img = img.get('src').split('base64,')[-1]
+        else:
+            img = "-1"
+        image_user.append(img)
+
     # Populate the final variable (this should be a list with all fields scraped)
 
     row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)

diff --git a/Forums/BestCardingWorld/parser.py b/Forums/BestCardingWorld/parser.py
index c4ca6e0..5a294c6 100644
--- a/Forums/BestCardingWorld/parser.py
+++ b/Forums/BestCardingWorld/parser.py
@@ -25,6 +25,7 @@ def bestcardingworld_description_parser(soup):
     sign = []       # 6 all user's signature in each post (usually a standard message after the content of the post)
     post = []       # 7 all messages of each post
     interest = []   # 8 all user's interest in each post
+    image_user = []
 
 
     # Finding the topic (should be just one coming from the Listing Page)
@@ -150,6 +151,10 @@ def bestcardingworld_description_parser(soup):
 
         feedback.append("-1")
 
+        img = ipost.find('div', {"class": "avatar-container"}).find('img', {"class": "avatar"})
+        img = img.get('src').split('base64,')[-1]
+        image_user.append(img)
+
     # Populate the final variable (this should be a list with all fields scraped)
 
     row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)

diff --git a/Forums/Cardingleaks/parser.py b/Forums/Cardingleaks/parser.py
index 98ddf3a..7ab139d 100644
--- a/Forums/Cardingleaks/parser.py
+++ b/Forums/Cardingleaks/parser.py
@@ -25,6 +25,7 @@ def cardingleaks_description_parser(soup: Tag):
     post = []       # 6 all messages of each post
     feedback = []   # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
     addDate = []    # 8 all dates of each post
+    image_user = []
 
     li = soup.find("h1", {"class": "p-title-value"})
     topic = cleanString(li.text.strip())
@@ -62,7 +63,10 @@ def cardingleaks_description_parser(soup: Tag):
         datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
         datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z")
         addDate.append(datetime_obj)
-
+
+        img = ipost.find('div', {"class": "message-avatar"}).find('img')
+        img = img.get('src').split('base64,')[-1]
+        image_user.append(img)
 
     # Populate the final variable (this should be a list with all fields scraped)
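[Reviewer note, not part of the patch] Every forum parser in this commit captures the avatar the same way: these XenForo-style boards inline avatars as data:image/...;base64,<payload> URIs, so splitting the img src on 'base64,' and taking the last element keeps just the encoded payload, while a plain URL passes through unchanged. Note also that the BestCardingWorld and Cardingleaks hunks above (and CryptBB below) skip the "is not None" guard used in Altenens and HiddenAnswers, so a post with no avatar would raise AttributeError there. A minimal standalone sketch of the pattern, using a hypothetical HTML snippet:

```python
# Standalone sketch of the avatar capture these hunks add.
# The HTML snippet is hypothetical; "-1" is the pipeline's missing-value sentinel.
from bs4 import BeautifulSoup

html = '<div class="message-avatar"><img src="data:image/png;base64,iVBORw0KGgo="></div>'
soup = BeautifulSoup(html, 'html.parser')

img = soup.find('div', {"class": "message-avatar"}).find('img')
if img is not None:
    img = img.get('src').split('base64,')[-1]  # keep only the base64 payload
else:
    img = "-1"                                 # sentinel for a missing avatar
print(img)  # iVBORw0KGgo=
```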
listInline"}).find("time").get("datetime") datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z") addDate.append(datetime_obj) - + + img = ipost.find('div', {"class": "message-avatar"}).find('img') + img = img.get('src').split('base64,')[-1] + image_user.append(img) # Populate the final variable (this should be a list with all fields scraped) diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py index bcef5f8..bfe4403 100644 --- a/Forums/CryptBB/parser.py +++ b/Forums/CryptBB/parser.py @@ -25,6 +25,7 @@ def cryptBB_description_parser(soup): post = [] # 6 all messages of each post feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) addDate = [] # 8 all dates of each post + image_user = [] # Finding the topic (should be just one coming from the Listing Page) @@ -155,6 +156,10 @@ def cryptBB_description_parser(soup): feedback.append("-1") + img = ipost.find('div', {"class": "author_avatar"}).find('img') + img = img.get('src').split('base64,')[-1] + image_user.append(img) + # Populate the final variable (this should be a list with all fields scraped) row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py index 16b56cb..e42ace8 100644 --- a/Forums/HiddenAnswers/parser.py +++ b/Forums/HiddenAnswers/parser.py @@ -22,7 +22,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): sign: List[str] = [] # all user's signature in each post (usually a standard message after the content of the post) post: List[str] = [] # all messages of each post interest: List[str] = [] # all user's interest in each post - + image_user = [] # Finding the topic (should be just one coming from the Listing Page) li = soup.find("h1").find("span", {"itemprop": "name"}) @@ -53,7 +53,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): feedback.append("-1") sign.append("-1") interest.append("-1") - + + img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img') + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" + image_user.append(img) answer_list: ResultSet[Tag] = soup.find("div", {"class": "qa-a-list"}).find_all("div", {"class": "qa-a-list-item"}) @@ -84,6 +90,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): sign.append("-1") interest.append("-1") + img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img') + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" + image_user.append(img) + # Populate the final variable (this should be a list with all fields scraped) row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) diff --git a/Forums/OnniForums/parser.py b/Forums/OnniForums/parser.py index 3854141..e0c780a 100644 --- a/Forums/OnniForums/parser.py +++ b/Forums/OnniForums/parser.py @@ -143,8 +143,7 @@ def onniForums_listing_parser(soup: BeautifulSoup): body = thread.find("span",{"class": "subject_new"}) try: post_subject: str = body.text #getting the topic - - except AttributeError: + except: body = thread.find("span",{"class": "subject_old"}) post_subject: str = body.text @@ -153,10 +152,10 @@ def onniForums_listing_parser(soup: BeautifulSoup): reply_count = thread.find_all("td", {"align": "center"})[2].text - post.append(reply_count) + post.append(cleanNumbers(reply_count)) views = thread.find_all("td", {"align": "center"})[3].text - view.append(views) + 
diff --git a/MarketPlaces/Apocalypse/parser.py b/MarketPlaces/Apocalypse/parser.py
index b7a4f63..8cd3a5b 100644
--- a/MarketPlaces/Apocalypse/parser.py
+++ b/MarketPlaces/Apocalypse/parser.py
@@ -141,8 +141,6 @@ def apocalypse_listing_parser(soup: Tag):
         product_price = prod.find("span", {"class": "priceP"}).text
         USD.append(cleanString(product_price.strip()))
 
-
-
         product_sold = prod.find("span", {"class": "badge badge-success"}).text
         sold.append(cleanString(product_sold.strip()))
 
@@ -168,7 +166,6 @@ def apocalypse_listing_parser(soup: Tag):
         # When split by the star (★), it should return a 2-value array
         product_vendor, product_vendor_rating = product_vendor_tag.text.split("★")
-
         try:
             vendor.append(cleanString(product_vendor.strip()))
             rating.append(cleanString(product_vendor_rating.strip()))
@@ -179,8 +176,7 @@ def apocalypse_listing_parser(soup: Tag):
         href.append(product_href)
 
         nm += 1
-
-
+
     return organizeProducts(
         marketplace=mktName,
         nm=nm,
@@ -208,6 +204,7 @@ def apocalypse_listing_parser(soup: Tag):
         image_vendor=image_vendor
     )
 
+
 #called by the crawler to get description links on a listing page
 #@param: beautifulsoup object that is using the correct html page (listing page)
 #return: list of description links from a listing page

diff --git a/MarketPlaces/GoFish/crawler_selenium.py b/MarketPlaces/GoFish/crawler_selenium.py
index 0f87696..e5af35b 100644
--- a/MarketPlaces/GoFish/crawler_selenium.py
+++ b/MarketPlaces/GoFish/crawler_selenium.py
@@ -31,7 +31,6 @@ baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    # opentor()
     mktName = getMKTName()
     driver = getAccess()
 
@@ -41,24 +40,11 @@ def startCrawling():
             crawlForum(driver)
         except Exception as e:
             print(driver.current_url, e)
-        closetor(driver)
+        closeDriver(driver)
 
     new_parse(mktName, baseURL, True)
 
 
-# Opens Tor Browser
-#prompts for ENTER input to continue
-def opentor():
-    from MarketPlaces.Initialization.markets_mining import config
-
-    global pid
-    print("Connecting Tor...")
-    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
-    pid = pro.pid
-    time.sleep(7.5)
-    input('Tor Connected. Press ENTER to continue\n')
-    return
-
 # Returns the name of the website
 #return: name of site in string type
 def getMKTName():
@@ -75,7 +61,7 @@ def getFixedURL():
 
 # Closes Tor Browser
 #@param: current selenium driver
-def closetor(driver):
+def closeDriver(driver):
     # global pid
     # os.system("taskkill /pid " + str(pro.pid))
     # os.system("taskkill /t /f /im tor.exe")
@@ -102,7 +88,7 @@ def createFFDriver():
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
     # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
-    ff_prof.set_preference("permissions.default.image", 1)
+    ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
     ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -118,7 +104,7 @@ def createFFDriver():
 
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
-    # driver.maximize_window()
+    driver.maximize_window()
 
     return driver
 
@@ -140,7 +126,6 @@ def getAccess():
 # then allows for manual solving of captcha in the terminal
 #@param: current selenium web driver
 def login(driver):
-    input("Press ENTER when CAPTCHA is completed\n")
 
     # wait for page to show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@@ -154,11 +139,12 @@ def login(driver):
     # Password here
     passwordBox.send_keys('DementedBed123-')
 
-    input("Press ENTER when CAPTCHA and exit pressed is completed\n")
+    input("Press ENTER when CAPTCHA is completed and logged in\n")
 
     # wait for listing page show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, "/html/body/div/div[2]/div/div/div/div/div/div[1]/a/img")))
+        (By.XPATH, "/html/body/div[1]/div[3]/div[1]/div[3]/ul/div")))
+
 
 # Saves the crawled html page, makes the directory path for html pages if not made
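[Reviewer note, not part of the patch] Both crawler diffs flip permissions.default.image from 1 to 3. In Firefox, 1 loads all images, 2 blocks all images, and 3 blocks only third-party images, so avatars hosted by the forum itself are still fetched, which is consistent with this commit's goal of capturing user images. A minimal sketch of the relevant profile setting:

```python
# Sketch of the Firefox preference both crawlers now set.
# 1 = load all images, 2 = block all, 3 = block third-party images only.
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

ff_prof = FirefoxProfile()
ff_prof.set_preference("permissions.default.image", 3)  # first-party avatars still load
```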
diff --git a/MarketPlaces/Torzon/crawler_selenium.py b/MarketPlaces/Torzon/crawler_selenium.py
index 6636e80..8560c57 100644
--- a/MarketPlaces/Torzon/crawler_selenium.py
+++ b/MarketPlaces/Torzon/crawler_selenium.py
@@ -34,7 +34,6 @@ BASE_URL = 'http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onio
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
     mktName = getMKTName()
     driver = getAccess()
 
@@ -44,25 +43,11 @@ def startCrawling():
             crawlForum(driver)
         except Exception as e:
             print(driver.current_url, e)
-        closetor(driver)
+        closeDriver(driver)
 
     new_parse(mktName, BASE_URL, False)
 
 
-# Opens Tor Browser
-#prompts for ENTER input to continue
-def opentor():
-    from MarketPlaces.Initialization.markets_mining import config
-
-    global pid
-    print("Connecting Tor...")
-    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
-    pid = pro.pid
-    time.sleep(7.5)
-    input('Tor Connected. Press ENTER to continue\n')
-    return
-
-
 # Returns the name of the website
 #return: name of site in string type
 def getMKTName():
@@ -79,7 +64,7 @@ def getFixedURL():
 
 # Closes Tor Browser
 #@param: current selenium driver
-def closetor(driver):
+def closeDriver(driver):
     # global pid
     # os.system("taskkill /pid " + str(pro.pid))
     # os.system("taskkill /t /f /im tor.exe")
@@ -96,7 +81,6 @@ def createFFDriver():
 
     ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
 
-
     ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
     ff_prof.set_preference("places.history.enabled", False)
     ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
@@ -107,7 +91,7 @@ def createFFDriver():
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
     # ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue
     # ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue
-    ff_prof.set_preference("permissions.default.image", 1)
+    ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
     ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -123,6 +107,8 @@ def createFFDriver():
 
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -146,15 +132,13 @@ def login(driver):
     input("Press ENTER when CAPTCHA is completed and page is loaded\n")
 
     # wait for page to show up (This Xpath may need to change based on different seed url)
+
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
-    # filePath = getFullPathName("Hello")
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
-    with open(filePath, 'wb') as file:
-        file.write(cleanPage.encode('utf-8'))
-    # open(filePath, 'wb').write(cleanPage.encode('utf-8'))
+    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
     return
 
@@ -191,16 +175,16 @@ def getInterestedLinks():
     links = []
 
     # # services
-    links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
+    # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
 
-    # # software & malware
+    # software & malware
     links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
 
     # # fraud
-    links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
+    # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
 
     # # guides
-    links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
+    # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
 
     return links
 
@@ -227,27 +211,27 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
                 itemURL = urlparse.urljoin(BASE_URL, str(item))
                 try:
-                    time.sleep(1.5) # to keep from detecting click speed
+                    # time.sleep(1.5) # to keep from detecting click speed
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
-                time.sleep(1.5)
+                savePage(driver, driver.page_source, item)
+                # time.sleep(1.5)
                 driver.back() # to keep from detecting click speed
 
-                # # comment out
-                # break
-                #
-                # # comment out
-                # if count == 1:
-                #     break
+                # comment out
+                break
+
+            # comment out
+            if count == 1:
+                break
 
             try:
                 # nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')
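[Reviewer note, not part of the patch] Two things worth flagging in the Torzon hunks: the rewritten savePage drops the "with open(...)" context manager in favor of open(...).write(...), which leaves file closing to the garbage collector, and the now-active break statements limit each run to the first product of the first listing page, which reads like a leftover testing switch. Separately, a minimal sketch (hypothetical names, not part of this patch) of how a captured base64 avatar payload could be materialized downstream:

```python
# Hypothetical downstream step: decode a captured avatar payload to a file.
import base64

def save_avatar(payload: str, out_path: str) -> None:
    if payload != "-1":                      # "-1" is the pipeline's missing-value sentinel
        with open(out_path, 'wb') as f:
            f.write(base64.b64decode(payload))

save_avatar("iVBORw0KGgo=", "avatar.png")    # hypothetical usage
```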