
user image tracking ONLY (missing post image) for some forums

main · westernmeadow, 1 year ago · commit 0345836e20
11 changed files with 82 additions and 72 deletions
  1. +1  -0   .idea/DW_Pipeline_Test.iml
  2. +8  -0   Forums/Altenens/parser.py
  3. +5  -0   Forums/BestCardingWorld/parser.py
  4. +5  -1   Forums/Cardingleaks/parser.py
  5. +5  -0   Forums/CryptBB/parser.py
  6. +15 -2   Forums/HiddenAnswers/parser.py
  7. +3  -4   Forums/OnniForums/parser.py
  8. +8  -0   Forums/Utilities/utilities.py
  9. +2  -5   MarketPlaces/Apocalypse/parser.py
  10. +7  -21  MarketPlaces/GoFish/crawler_selenium.py
  11. +23 -39  MarketPlaces/Torzon/crawler_selenium.py

+1 -0  .idea/DW_Pipeline_Test.iml

@ -28,6 +28,7 @@
<option value="$MODULE_DIR$/Forums/Libre" />
<option value="$MODULE_DIR$/Forums/Procrax" />
<option value="$MODULE_DIR$/MarketPlaces/DarkBazar" />
<option value="$MODULE_DIR$/MarketPlaces/AnonMarket" />
</list>
</option>
</component>

+8 -0  Forums/Altenens/parser.py

@ -22,6 +22,7 @@ def altenens_description_parser(soup):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = []
topic = soup.find("h1", {"class": "p-title-value"}).text
topic = cleanString(topic.strip())
@ -66,6 +67,13 @@ def altenens_description_parser(soup):
date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
addDate.append(date_time_obj)
img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
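
Every forum hunk in this commit follows the same pattern: locate the post's avatar <img>, keep only the payload of its base64 data URI, and record "-1" when no avatar exists. A minimal helper sketch of that pattern (the name avatar_to_base64 is hypothetical and not part of the commit; Altenens and HiddenAnswers use the guarded form shown here, while the BestCardingWorld, Cardingleaks and CryptBB hunks call .get('src') without the None check):

    def avatar_to_base64(container):
        # container: the element wrapping the avatar, e.g. the "message-avatar-wrapper" div
        if container is None:
            return "-1"
        img = container.find('img')
        if img is None or not img.get('src'):
            return "-1"
        # inline avatars are data URIs; keep only the part after "base64,"
        return img.get('src').split('base64,')[-1]

    image_user.append(avatar_to_base64(ipost.find('div', {"class": "message-avatar-wrapper"})))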


+5 -0  Forums/BestCardingWorld/parser.py

@ -25,6 +25,7 @@ def bestcardingworld_description_parser(soup):
sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 7 all messages of each post
interest = [] # 8 all user's interest in each post
image_user = []
# Finding the topic (should be just one coming from the Listing Page)
@ -150,6 +151,10 @@ def bestcardingworld_description_parser(soup):
feedback.append("-1")
img = ipost.find('div', {"class": "avatar-container"}).find('img', {"class": "avatar"})
img = img.get('src').split('base64,')[-1]
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)


+5 -1  Forums/Cardingleaks/parser.py

@ -25,6 +25,7 @@ def cardingleaks_description_parser(soup: Tag):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = []
li = soup.find("h1", {"class": "p-title-value"})
topic = cleanString(li.text.strip())
@ -62,7 +63,10 @@ def cardingleaks_description_parser(soup: Tag):
datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
img = ipost.find('div', {"class": "message-avatar"}).find('img')
img = img.get('src').split('base64,')[-1]
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)


+5 -0  Forums/CryptBB/parser.py

@ -25,6 +25,7 @@ def cryptBB_description_parser(soup):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = []
# Finding the topic (should be just one coming from the Listing Page)
@ -155,6 +156,10 @@ def cryptBB_description_parser(soup):
feedback.append("-1")
img = ipost.find('div', {"class": "author_avatar"}).find('img')
img = img.get('src').split('base64,')[-1]
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)


+15 -2  Forums/HiddenAnswers/parser.py

@ -22,7 +22,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
sign: List[str] = [] # all user's signature in each post (usually a standard message after the content of the post)
post: List[str] = [] # all messages of each post
interest: List[str] = [] # all user's interest in each post
image_user = []
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("h1").find("span", {"itemprop": "name"})
@ -53,7 +53,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
feedback.append("-1")
sign.append("-1")
interest.append("-1")
img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
answer_list: ResultSet[Tag] = soup.find("div", {"class": "qa-a-list"}).find_all("div", {"class": "qa-a-list-item"})
@ -84,6 +90,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
sign.append("-1")
interest.append("-1")
img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
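
The question and answer branches above repeat the same None guard; with a helper like the avatar_to_base64 sketch shown after the Altenens hunk, each branch reduces to a single append (a hypothetical refactor, not what this commit does):

    image_user.append(avatar_to_base64(question.find('span', {"class": "qa-q-view-avatar-meta"})))
    image_user.append(avatar_to_base64(replies.find('span', {"class": "qa-a-item-avatar-meta"})))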


+3 -4  Forums/OnniForums/parser.py

@ -143,8 +143,7 @@ def onniForums_listing_parser(soup: BeautifulSoup):
body = thread.find("span",{"class": "subject_new"})
try:
post_subject: str = body.text #getting the topic
except AttributeError:
except:
body = thread.find("span",{"class": "subject_old"})
post_subject: str = body.text
@ -153,10 +152,10 @@ def onniForums_listing_parser(soup: BeautifulSoup):
reply_count = thread.find_all("td", {"align": "center"})[2].text
post.append(reply_count)
post.append(cleanNumbers(reply_count))
views = thread.find_all("td", {"align": "center"})[3].text
view.append(views)
view.append(cleanNumbers(views))
# dates_added: str = thread.find("span",{"class" : "thread_start_datetime smalltext"}).text
# dates_added_cleaned = dates_added.split(',')[0]


+8 -0  Forums/Utilities/utilities.py

@ -306,6 +306,14 @@ def convertFromLongDate(longDate, crawlerdate):
return correct_date
def cleanNumbers(inputString):
reg_ex = re.compile(r'[^\d.]+')
updated_string = reg_ex.sub('', inputString)
return updated_string
def aes_encryption(item):
to_bytes = bytes(item)
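
The new cleanNumbers helper strips every character except digits and periods, which is why the OnniForums listing parser now wraps its reply and view counts in it. A quick illustration with made-up inputs:

    import re

    def cleanNumbers(inputString):
        reg_ex = re.compile(r'[^\d.]+')   # drop everything that is not a digit or '.'
        return reg_ex.sub('', inputString)

    print(cleanNumbers("Replies: 1,204"))   # -> "1204"
    print(cleanNumbers("3.5k views"))       # -> "3.5"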


+2 -5  MarketPlaces/Apocalypse/parser.py

@ -141,8 +141,6 @@ def apocalypse_listing_parser(soup: Tag):
product_price = prod.find("span", {"class": "priceP"}).text
USD.append(cleanString(product_price.strip()))
product_sold = prod.find("span", {"class": "badge badge-success"}).text
sold.append(cleanString(product_sold.strip()))
@ -168,7 +166,6 @@ def apocalypse_listing_parser(soup: Tag):
# When split by the star (★), it should return a 2-value array
product_vendor, product_vendor_rating = product_vendor_tag.text.split("★")
try:
vendor.append(cleanString(product_vendor.strip()))
rating.append(cleanString(product_vendor_rating.strip()))
@ -179,8 +176,7 @@ def apocalypse_listing_parser(soup: Tag):
href.append(product_href)
nm += 1
return organizeProducts(
marketplace=mktName,
nm=nm,
@ -208,6 +204,7 @@ def apocalypse_listing_parser(soup: Tag):
image_vendor=image_vendor
)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page


+7 -21  MarketPlaces/GoFish/crawler_selenium.py

@ -31,7 +31,6 @@ baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -41,24 +40,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -75,7 +61,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -102,7 +88,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -118,7 +104,7 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
# driver.maximize_window()
driver.maximize_window()
return driver
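
Both crawler files switch permissions.default.image from 1 to 3. In Firefox that preference is usually documented as 1 = load all images, 2 = block all images, 3 = block only third-party images, so the likely intent (not stated in the diff itself) is to keep first-party avatars and product images while skipping externally hosted ones:

    # assumed Firefox semantics: 1 = allow all, 2 = block all, 3 = block third-party images
    ff_prof.set_preference("permissions.default.image", 3)  # keep on-site images, drop third-party ones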
@ -140,7 +126,6 @@ def getAccess():
# then allows for manual solving of captcha in the terminal
#@param: current selenium web driver
def login(driver):
input("Press ENTER when CAPTCHA is completed\n")
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@ -154,11 +139,12 @@ def login(driver):
# Password here
passwordBox.send_keys('DementedBed123-')
input("Press ENTER when CAPTCHA and exit pressed is completed\n")
input("Press ENTER when CAPTCHA is completed and logged in\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[2]/div/div/div/div/div/div[1]/a/img")))
(By.XPATH, "/html/body/div[1]/div[3]/div[1]/div[3]/ul/div")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):


+23 -39  MarketPlaces/Torzon/crawler_selenium.py

@ -34,7 +34,6 @@ BASE_URL = 'http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onio
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
opentor()
mktName = getMKTName()
driver = getAccess()
@ -44,25 +43,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, BASE_URL, False)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -79,7 +64,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -96,7 +81,6 @@ def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
@ -107,7 +91,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue
# ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -123,6 +107,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
@ -146,15 +132,13 @@ def login(driver):
input("Press ENTER when CAPTCHA is completed and page is loaded\n")
# wait for page to show up (This Xpath may need to change based on different seed url)
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(page, url):
cleanPage = cleanHTML(page)
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
# filePath = getFullPathName("Hello")
os.makedirs(os.path.dirname(filePath), exist_ok=True)
with open(filePath, 'wb') as file:
file.write(cleanPage.encode('utf-8'))
# open(filePath, 'wb').write(cleanPage.encode('utf-8'))
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
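
savePage in the Torzon crawler now takes the driver and forwards it to cleanHTML, matching the call sites below (savePage(driver, html, link)). A reconstruction of the function after this change (cleanHTML and getFullPathName are the project's own utilities; whether the context-managed write or the one-line open(...).write(...) is the surviving form is ambiguous in the copied hunk, and this sketch assumes the former):

    def savePage(driver, page, url):
        cleanPage = cleanHTML(driver, page)
        filePath = getFullPathName(url)
        os.makedirs(os.path.dirname(filePath), exist_ok=True)
        with open(filePath, 'wb') as file:
            file.write(cleanPage.encode('utf-8'))
        return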
@ -191,16 +175,16 @@ def getInterestedLinks():
links = []
# # services
links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# # software & malware
# software & malware
links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# # fraud
links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# # guides
links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
# links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
return links
@ -227,27 +211,27 @@ def crawlForum(driver):
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(BASE_URL, str(item))
try:
time.sleep(1.5) # to keep from detecting click speed
# time.sleep(1.5) # to keep from detecting click speed
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
time.sleep(1.5)
savePage(driver, driver.page_source, item)
# time.sleep(1.5)
driver.back()
# to keep from detecting click speed
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
break
# comment out
if count == 1:
break
try:
# nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')
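
The crawlForum hunk re-enables the previously commented iteration limits: the inner loop breaks after saving the first product page, and "if count == 1: break" stops after the first listing page, which reads like a test-run cap rather than a full crawl. A trimmed sketch of the loop shape implied by the hunk (loop nesting and indentation are approximated, since the copied diff loses them, and the try/except-with-refresh error handling is omitted):

    for link in getInterestedLinks():
        driver.get(link)
        count = 0
        has_next_page = True
        while has_next_page:
            html = driver.page_source
            savePage(driver, html, link)

            for item in productPages(html):
                itemURL = urlparse.urljoin(BASE_URL, str(item))
                driver.get(itemURL)
                savePage(driver, driver.page_source, item)
                driver.back()
                break                  # test limit: first product only

            count += 1
            if count == 1:
                break                  # test limit: first listing page only
            # otherwise locate the pagination element (the nav XPath below) and continue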

