From 0345836e20edacb80377d28781bb29a55e7dcb82 Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Tue, 5 Sep 2023 17:59:33 -0700
Subject: [PATCH] user image tracking ONLY (missing post image) for some forums

---
 .idea/DW_Pipeline_Test.iml              |  1 +
 Forums/Altenens/parser.py               |  8 ++++
 Forums/BestCardingWorld/parser.py       |  5 ++
 Forums/Cardingleaks/parser.py           |  6 ++-
 Forums/CryptBB/parser.py                |  5 ++
 Forums/HiddenAnswers/parser.py          | 17 ++++++-
 Forums/OnniForums/parser.py             |  7 ++-
 Forums/Utilities/utilities.py           |  8 ++++
 MarketPlaces/Apocalypse/parser.py       |  7 +--
 MarketPlaces/GoFish/crawler_selenium.py | 28 +++--------
 MarketPlaces/Torzon/crawler_selenium.py | 62 +++++++++----------------
 11 files changed, 82 insertions(+), 72 deletions(-)

diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index f27dbb9..9ee2f4c 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -28,6 +28,7 @@

diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py
index 19155d5..bdad19d 100644
--- a/Forums/Altenens/parser.py
+++ b/Forums/Altenens/parser.py
@@ -22,6 +22,7 @@ def altenens_description_parser(soup):
     post = []       # 6 all messages of each post
     feedback = []   # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
     addDate = []    # 8 all dates of each post
+    image_user = []
 
     topic = soup.find("h1", {"class": "p-title-value"}).text
     topic = cleanString(topic.strip())
@@ -66,6 +67,13 @@ def altenens_description_parser(soup):
             date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
             addDate.append(date_time_obj)
 
+        img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img')
+        if img is not None:
+            img = img.get('src').split('base64,')[-1]
+        else:
+            img = "-1"
+        image_user.append(img)
+
     # Populate the final variable (this should be a list with all fields scraped)
 
     row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)

diff --git a/Forums/BestCardingWorld/parser.py b/Forums/BestCardingWorld/parser.py
index c4ca6e0..5a294c6 100644
--- a/Forums/BestCardingWorld/parser.py
+++ b/Forums/BestCardingWorld/parser.py
@@ -25,6 +25,7 @@ def bestcardingworld_description_parser(soup):
     sign = []       # 6 all user's signature in each post (usually a standard message after the content of the post)
     post = []       # 7 all messages of each post
     interest = []   # 8 all user's interest in each post
+    image_user = []
 
 
     # Finding the topic (should be just one coming from the Listing Page)
@@ -150,6 +151,10 @@ def bestcardingworld_description_parser(soup):
 
         feedback.append("-1")
 
+        img = ipost.find('div', {"class": "avatar-container"}).find('img', {"class": "avatar"})
+        img = img.get('src').split('base64,')[-1]
+        image_user.append(img)
+
     # Populate the final variable (this should be a list with all fields scraped)
 
     row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)

diff --git a/Forums/Cardingleaks/parser.py b/Forums/Cardingleaks/parser.py
index 98ddf3a..7ab139d 100644
--- a/Forums/Cardingleaks/parser.py
+++ b/Forums/Cardingleaks/parser.py
@@ -25,6 +25,7 @@ def cardingleaks_description_parser(soup: Tag):
     post = []       # 6 all messages of each post
     feedback = []   # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
     addDate = []    # 8 all dates of each post
+    image_user = []
 
     li = soup.find("h1", {"class": "p-title-value"})
     topic = cleanString(li.text.strip())
@@ -62,7 +63,10 @@ def cardingleaks_description_parser(soup: Tag):
         datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
         datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z")
         addDate.append(datetime_obj)
-
+
+        img = ipost.find('div', {"class": "message-avatar"}).find('img')
+        img = img.get('src').split('base64,')[-1]
+        image_user.append(img)
 
     # Populate the final variable (this should be a list with all fields scraped)
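[Reviewer note, not part of the patch] Every forum parser in this commit captures the avatar the same way: these XenForo-style boards inline avatars as data:image/...;base64,<payload> URIs, so splitting the img src on 'base64,' and taking the last element keeps just the encoded payload, while a plain URL passes through unchanged. Note also that the BestCardingWorld and Cardingleaks hunks above (and CryptBB below) skip the "is not None" guard used in Altenens and HiddenAnswers, so a post with no avatar would raise AttributeError there. A minimal standalone sketch of the pattern, using a hypothetical HTML snippet:

```python
# Standalone sketch of the avatar capture these hunks add.
# The HTML snippet is hypothetical; "-1" is the pipeline's missing-value sentinel.
from bs4 import BeautifulSoup

html = '<div class="message-avatar"><img src="data:image/png;base64,iVBORw0KGgo="></div>'
soup = BeautifulSoup(html, 'html.parser')

img = soup.find('div', {"class": "message-avatar"}).find('img')
if img is not None:
    img = img.get('src').split('base64,')[-1]  # keep only the base64 payload
else:
    img = "-1"                                 # sentinel for a missing avatar
print(img)  # iVBORw0KGgo=
```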
listInline"}).find("time").get("datetime") datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z") addDate.append(datetime_obj) - + + img = ipost.find('div', {"class": "message-avatar"}).find('img') + img = img.get('src').split('base64,')[-1] + image_user.append(img) # Populate the final variable (this should be a list with all fields scraped) diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py index bcef5f8..bfe4403 100644 --- a/Forums/CryptBB/parser.py +++ b/Forums/CryptBB/parser.py @@ -25,6 +25,7 @@ def cryptBB_description_parser(soup): post = [] # 6 all messages of each post feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) addDate = [] # 8 all dates of each post + image_user = [] # Finding the topic (should be just one coming from the Listing Page) @@ -155,6 +156,10 @@ def cryptBB_description_parser(soup): feedback.append("-1") + img = ipost.find('div', {"class": "author_avatar"}).find('img') + img = img.get('src').split('base64,')[-1] + image_user.append(img) + # Populate the final variable (this should be a list with all fields scraped) row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py index 16b56cb..e42ace8 100644 --- a/Forums/HiddenAnswers/parser.py +++ b/Forums/HiddenAnswers/parser.py @@ -22,7 +22,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): sign: List[str] = [] # all user's signature in each post (usually a standard message after the content of the post) post: List[str] = [] # all messages of each post interest: List[str] = [] # all user's interest in each post - + image_user = [] # Finding the topic (should be just one coming from the Listing Page) li = soup.find("h1").find("span", {"itemprop": "name"}) @@ -53,7 +53,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): feedback.append("-1") sign.append("-1") interest.append("-1") - + + img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img') + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" + image_user.append(img) answer_list: ResultSet[Tag] = soup.find("div", {"class": "qa-a-list"}).find_all("div", {"class": "qa-a-list-item"}) @@ -84,6 +90,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): sign.append("-1") interest.append("-1") + img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img') + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" + image_user.append(img) + # Populate the final variable (this should be a list with all fields scraped) row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) diff --git a/Forums/OnniForums/parser.py b/Forums/OnniForums/parser.py index 3854141..e0c780a 100644 --- a/Forums/OnniForums/parser.py +++ b/Forums/OnniForums/parser.py @@ -143,8 +143,7 @@ def onniForums_listing_parser(soup: BeautifulSoup): body = thread.find("span",{"class": "subject_new"}) try: post_subject: str = body.text #getting the topic - - except AttributeError: + except: body = thread.find("span",{"class": "subject_old"}) post_subject: str = body.text @@ -153,10 +152,10 @@ def onniForums_listing_parser(soup: BeautifulSoup): reply_count = thread.find_all("td", {"align": "center"})[2].text - post.append(reply_count) + post.append(cleanNumbers(reply_count)) views = thread.find_all("td", {"align": "center"})[3].text - view.append(views) + 
diff --git a/MarketPlaces/Apocalypse/parser.py b/MarketPlaces/Apocalypse/parser.py
index b7a4f63..8cd3a5b 100644
--- a/MarketPlaces/Apocalypse/parser.py
+++ b/MarketPlaces/Apocalypse/parser.py
@@ -141,8 +141,6 @@ def apocalypse_listing_parser(soup: Tag):
         product_price = prod.find("span", {"class": "priceP"}).text
         USD.append(cleanString(product_price.strip()))
 
-
-
         product_sold = prod.find("span", {"class": "badge badge-success"}).text
         sold.append(cleanString(product_sold.strip()))
 
@@ -168,7 +166,6 @@ def apocalypse_listing_parser(soup: Tag):
         # When split by the star (★), it should return a 2-value array
         product_vendor, product_vendor_rating = product_vendor_tag.text.split("★")
-
         try:
             vendor.append(cleanString(product_vendor.strip()))
             rating.append(cleanString(product_vendor_rating.strip()))
@@ -179,8 +176,7 @@ def apocalypse_listing_parser(soup: Tag):
         href.append(product_href)
 
         nm += 1
-
-
+
     return organizeProducts(
         marketplace=mktName,
         nm=nm,
@@ -208,6 +204,7 @@ def apocalypse_listing_parser(soup: Tag):
         image_vendor=image_vendor
     )
 
+
 #called by the crawler to get description links on a listing page
 #@param: beautifulsoup object that is using the correct html page (listing page)
 #return: list of description links from a listing page

diff --git a/MarketPlaces/GoFish/crawler_selenium.py b/MarketPlaces/GoFish/crawler_selenium.py
index 0f87696..e5af35b 100644
--- a/MarketPlaces/GoFish/crawler_selenium.py
+++ b/MarketPlaces/GoFish/crawler_selenium.py
@@ -31,7 +31,6 @@ baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    # opentor()
     mktName = getMKTName()
     driver = getAccess()
 
@@ -41,24 +40,11 @@ def startCrawling():
             crawlForum(driver)
         except Exception as e:
             print(driver.current_url, e)
-        closetor(driver)
+        closeDriver(driver)
 
     new_parse(mktName, baseURL, True)
 
 
-# Opens Tor Browser
-#prompts for ENTER input to continue
-def opentor():
-    from MarketPlaces.Initialization.markets_mining import config
-
-    global pid
-    print("Connecting Tor...")
-    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
-    pid = pro.pid
-    time.sleep(7.5)
-    input('Tor Connected. Press ENTER to continue\n')
-    return
-
 # Returns the name of the website
 #return: name of site in string type
 def getMKTName():
@@ -75,7 +61,7 @@ def getFixedURL():
 
 # Closes Tor Browser
 #@param: current selenium driver
-def closetor(driver):
+def closeDriver(driver):
     # global pid
     # os.system("taskkill /pid " + str(pro.pid))
     # os.system("taskkill /t /f /im tor.exe")
@@ -102,7 +88,7 @@ def createFFDriver():
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
     # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
-    ff_prof.set_preference("permissions.default.image", 1)
+    ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
     ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -118,7 +104,7 @@ def createFFDriver():
 
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
-    # driver.maximize_window()
+    driver.maximize_window()
 
     return driver
 
@@ -140,7 +126,6 @@ def getAccess():
 # then allows for manual solving of captcha in the terminal
 #@param: current selenium web driver
 def login(driver):
-    input("Press ENTER when CAPTCHA is completed\n")
 
     # wait for page to show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@@ -154,11 +139,12 @@ def login(driver):
     # Password here
     passwordBox.send_keys('DementedBed123-')
 
-    input("Press ENTER when CAPTCHA and exit pressed is completed\n")
+    input("Press ENTER when CAPTCHA is completed and logged in\n")
 
     # wait for listing page show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, "/html/body/div/div[2]/div/div/div/div/div/div[1]/a/img")))
+        (By.XPATH, "/html/body/div[1]/div[3]/div[1]/div[3]/ul/div")))
+
 
 # Saves the crawled html page, makes the directory path for html pages if not made
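[Reviewer note, not part of the patch] Both crawler diffs flip permissions.default.image from 1 to 3. In Firefox, 1 loads all images, 2 blocks all images, and 3 blocks only third-party images, so avatars hosted by the forum itself are still fetched, which is consistent with this commit's goal of capturing user images. A minimal sketch of the relevant profile setting:

```python
# Sketch of the Firefox preference both crawlers now set.
# 1 = load all images, 2 = block all, 3 = block third-party images only.
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

ff_prof = FirefoxProfile()
ff_prof.set_preference("permissions.default.image", 3)  # first-party avatars still load
```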
diff --git a/MarketPlaces/Torzon/crawler_selenium.py b/MarketPlaces/Torzon/crawler_selenium.py
index 6636e80..8560c57 100644
--- a/MarketPlaces/Torzon/crawler_selenium.py
+++ b/MarketPlaces/Torzon/crawler_selenium.py
@@ -34,7 +34,6 @@ BASE_URL = 'http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onio
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
     mktName = getMKTName()
     driver = getAccess()
 
@@ -44,25 +43,11 @@ def startCrawling():
             crawlForum(driver)
         except Exception as e:
             print(driver.current_url, e)
-        closetor(driver)
+        closeDriver(driver)
 
     new_parse(mktName, BASE_URL, False)
 
 
-# Opens Tor Browser
-#prompts for ENTER input to continue
-def opentor():
-    from MarketPlaces.Initialization.markets_mining import config
-
-    global pid
-    print("Connecting Tor...")
-    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
-    pid = pro.pid
-    time.sleep(7.5)
-    input('Tor Connected. Press ENTER to continue\n')
-    return
-
-
 # Returns the name of the website
 #return: name of site in string type
 def getMKTName():
@@ -79,7 +64,7 @@ def getFixedURL():
 
 # Closes Tor Browser
 #@param: current selenium driver
-def closetor(driver):
+def closeDriver(driver):
     # global pid
     # os.system("taskkill /pid " + str(pro.pid))
     # os.system("taskkill /t /f /im tor.exe")
@@ -96,7 +81,6 @@ def createFFDriver():
 
     ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
 
-
     ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
     ff_prof.set_preference("places.history.enabled", False)
     ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
@@ -107,7 +91,7 @@ def createFFDriver():
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
     # ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue
     # ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue
-    ff_prof.set_preference("permissions.default.image", 1)
+    ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
     ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -123,6 +107,8 @@ def createFFDriver():
 
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -146,15 +132,13 @@ def login(driver):
     input("Press ENTER when CAPTCHA is completed and page is loaded\n")
 
     # wait for page to show up (This Xpath may need to change based on different seed url)
+
 
 # Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
-    # filePath = getFullPathName("Hello")
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
-    with open(filePath, 'wb') as file:
-        file.write(cleanPage.encode('utf-8'))
-    # open(filePath, 'wb').write(cleanPage.encode('utf-8'))
+    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
     return
 
@@ -191,16 +175,16 @@ def getInterestedLinks():
     links = []
 
     # # services
-    links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
+    # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
 
-    # # software & malware
+    # software & malware
     links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
 
     # # fraud
-    links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
+    # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
 
     # # guides
-    links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
+    # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
 
     return links
 
@@ -227,27 +211,27 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)
 
             list = productPages(html)
             for item in list:
                 itemURL = urlparse.urljoin(BASE_URL, str(item))
                 try:
-                    time.sleep(1.5) # to keep from detecting click speed
+                    # time.sleep(1.5) # to keep from detecting click speed
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
-                time.sleep(1.5)
+                savePage(driver, driver.page_source, item)
+                # time.sleep(1.5)
                 driver.back() # to keep from detecting click speed
 
-                # # comment out
-                # break
-                #
-                # # comment out
-                # if count == 1:
-                #     break
+                # comment out
+                break
+
+            # comment out
+            if count == 1:
+                break
 
             try:
                 # nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')
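[Reviewer note, not part of the patch] Two things worth flagging in the Torzon hunks: the rewritten savePage drops the "with open(...)" context manager in favor of open(...).write(...), which leaves file closing to the garbage collector, and the now-active break statements limit each run to the first product of the first listing page, which reads like a leftover testing switch. Separately, a minimal sketch (hypothetical names, not part of this patch) of how a captured base64 avatar payload could be materialized downstream:

```python
# Hypothetical downstream step: decode a captured avatar payload to a file.
import base64

def save_avatar(payload: str, out_path: str) -> None:
    if payload != "-1":                      # "-1" is the pipeline's missing-value sentinel
        with open(out_path, 'wb') as f:
            f.write(base64.b64decode(payload))

save_avatar("iVBORw0KGgo=", "avatar.png")    # hypothetical usage
```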