
Merge remote-tracking branch 'origin/main'

main
Helium committed 11 months ago
commit 7b552c8d70
5 changed files with 123 additions and 169 deletions
  1. MarketPlaces/GoFish/crawler_selenium.py            +14  -11
  2. MarketPlaces/MikesGrandStore/crawler_selenium.py   +12  -17
  3. MarketPlaces/MikesGrandStore/parser.py             +14  -43
  4. MarketPlaces/WeTheNorth/crawler_selenium.py        +17  -45
  5. MarketPlaces/WeTheNorth/parser.py                  +66  -53

MarketPlaces/GoFish/crawler_selenium.py  (+14 -11)

@@ -118,7 +118,9 @@ def getAccess():
 def login(driver):
+    input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '//*[@id="username"]')))
     # entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')

@@ -131,6 +133,7 @@ def login(driver):
     submit = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/form/div[7]/input')
     submit.click()
+    input("Press ENTER when CAPTCHA is complete and home page has loaded\n")
     # wait for listing page show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(

@@ -175,17 +178,17 @@ def getInterestedLinks():
     links = []

     # Hosting and Security
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
     # Exploits and Kits
     links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')
     # Botnets and Malware
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
     # Other Software
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
     # Hacking Guide
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
     # Fraud (mostly carding)
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')

     return links

@@ -222,12 +225,12 @@ def crawlForum(driver):
                        savePage(driver, driver.page_source, item)
                        driver.back()

-                    # comment out
-                    # break
-                    # comment out
+                    # # comment out
+                    # break
+                    #
+                    # # comment out
                     # if count == 1:
-                    # break
+                    #     break

                try:
                    link = driver.find_element(by=By.XPATH, value='/html/body/div/div[3]/div[2]/div[2]/nav/ul/li[3]/a').get_attribute('href')
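Aside: the GoFish login change above is the usual manual-CAPTCHA pattern for Tor crawlers: block on operator input, then still wait for a concrete element before typing, since onion page loads can lag well past the operator's ENTER. A minimal sketch of that pattern (the XPaths and credentials here are placeholders, not GoFish's real ones):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    def login(driver):
        # Block until the human operator has solved the CAPTCHA in the browser.
        input("Press ENTER when CAPTCHA is complete and login page has loaded\n")

        # Then wait (up to 100 s) until the login form is actually visible.
        WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
            (By.XPATH, '//*[@id="username"]')))

        driver.find_element(by=By.XPATH, value='//*[@id="username"]').send_keys('user')
        driver.find_element(by=By.XPATH, value='//*[@id="password"]').send_keys('pass')
        driver.find_element(by=By.XPATH, value='//form//input[@type="submit"]').click()

        # Second manual pause: the post-login page may show another CAPTCHA.
        input("Press ENTER when CAPTCHA is complete and home page has loaded\n")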


MarketPlaces/MikesGrandStore/crawler_selenium.py  (+12 -17)

@@ -26,7 +26,7 @@ from MarketPlaces.MikesGrandStore.parser import MikesGrandStore_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML

 counter = 1
-baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion'
+baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/'

 def startCrawling():

@@ -83,8 +83,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    # ff_prof.set_preference("network.dns.disablePrefetch", True)
-    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)

@@ -159,11 +159,6 @@ def getFullPathName(url):
     return fullPath

-def getMKTName() -> str:
-    name = 'MikesGrandStore'
-    return name
-
 def getNameFromURL(url):
     global counter
     name = ''.join(e for e in url if e.isalnum())

@@ -178,6 +173,10 @@ def getInterestedLinks():
     # Hacking
     links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/hacking/')
+    # Carding
+    links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/carding/')
+    # Databases
+    links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/databases/')

     return links

@@ -215,11 +214,11 @@ def crawlForum(driver):
                        savePage(driver, driver.page_source, item)
                        driver.back()

-                    # comment out
-                    #break
-
-                    # comment out
-                    #if count == 1:
+                    # # comment out
+                    # break
+                    #
+                    # # comment out
+                    # if count == 1:
                     #     break

                # go to next page

@@ -260,7 +259,3 @@ def crawler():
 def crawler():
     startCrawling()
-
-if __name__ == '__main__':
-    startCrawling()
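Aside: the two preferences un-commented in createFFDriver() matter for Tor crawling: disabling DNS prefetch keeps Firefox from resolving hostnames outside the SOCKS tunnel, and suppressing the Referer header avoids leaking the previously visited onion URL. A minimal sketch of the same hardening with Selenium 4's Options API (the local SOCKS proxy address is an assumption, not taken from this repo):

    from selenium.webdriver import Firefox
    from selenium.webdriver.firefox.options import Options

    def make_driver():
        opts = Options()
        opts.set_preference("network.dns.disablePrefetch", True)   # no DNS prefetch leaks
        opts.set_preference("network.http.sendRefererHeader", 0)   # never send Referer
        opts.set_preference("permissions.default.image", 3)        # block images (faster over Tor)
        opts.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
        opts.set_preference("signon.rememberSignons", False)
        # Route through a local Tor SOCKS proxy (assumed at 127.0.0.1:9150).
        opts.set_preference("network.proxy.type", 1)
        opts.set_preference("network.proxy.socks", "127.0.0.1")
        opts.set_preference("network.proxy.socks_port", 9150)
        opts.set_preference("network.proxy.socks_remote_dns", True)
        return Firefox(options=opts)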

MarketPlaces/MikesGrandStore/parser.py  (+14 -43)

@@ -38,55 +38,40 @@ def MikesGrandStore_description_parser(soup):
     # Finding Product Name
     name = soup.find('h1', {'class': 'product-title product_title entry-title'}).text
-    name = name.replace('\n', ' ')
-    name = name.replace(",", "")
+    name = cleanString(name)
     name = name.strip()

-    divmb = soup.findAll('div', {'class': "mb-1"})
-
     # Finding Vendor
     # no vendor
     vendor = "MikesGrandStore"

     # Finding the Product Rating
     rating_item = soup.find('strong', {'class', 'rating'}).text
-    rating_item = rating_item.replace('\n', ' ')
-    rating_item = rating_item.replace(",", "")
+    rating_item = cleanNumbers(rating_item)
     rating_item = rating_item.strip()

     # Finding Number of Product Reviews
     review_container = soup.find('li', {'id': 'tab-title-reviews'})
     reviews = review_container.find('a').text
-    reviews = reviews.replace('Reviews', '')
-    reviews = reviews.replace('(', '')
-    reviews = reviews.replace(')', '')
-    reviews = reviews.replace('\n', ' ')
-    reviews = reviews.replace(",", "")
+    reviews = cleanNumbers(reviews)
     reviews = reviews.strip()

     # Finding Prices
-    USD = soup.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling
-    USD = USD.replace('\n', ' ')
-    USD = USD.replace(",", "")
+    USD = soup.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling.text
+    USD = cleanNumbers(USD)
     USD = USD.strip()

-    # Finding the Product Category
-    cat_container = soup.find('span', {'class': 'posted_in'})
-    cat = cat_container.findAll('a')
-    category = ""
-    for name in cat:
-        category = category + " " + name.text
-
     # Finding the Product Quantity Available
     stock = soup.find('p', {'class': 'stock in-stock'})
     if stock is not None:
         left = stock.text
-        left = left.replace("in stock", "")
+        left = cleanNumbers(left)
         left = left.strip()

     # Finding the Product description
-    desc_cont = soup.find('div', {'class': 'product-short-description'})
-    describe = desc_cont.find('p').text.strip()
+    describe = soup.find('div', {'id': 'tab-description'}).text
+    describe = cleanString(describe)
+    describe = describe.strip()

     # Finding Product Image
     image = soup.find('img', {'class': 'wp-post-image skip-lazy'})

@@ -158,29 +143,19 @@ def MikesGrandStore_listing_parser(soup):
     nm = len(listing)

     for a in listing:
-        bae = a.findAll('a', href=True)
-        lb = a.findAll('div', {"id": "littlebox"})
-
         # Adding the url to the list of urls
         link = a.find('a', {'class': 'woocommerce-LoopProduct-link woocommerce-loop-product__link'}).get('href')
         href.append(link)

         # Finding the Product
         product = a.find('a', {'class': 'woocommerce-LoopProduct-link woocommerce-loop-product__link'}).text
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
-        product = product.replace("...", "")
+        product = cleanString(product)
         product = product.strip()
         name.append(product)

-        # Finding Product Image
-        product_image = a.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
-        product_image = product_image.get('src')
-        product_image = product_image.split('base64,')[-1]
-        image.append(product_image)
-
         # Finding Prices
-        price = a.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling
+        price = a.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling.text
+        price = cleanNumbers(price)
         price = price.strip()
         USD.append(price)

@@ -193,15 +168,16 @@ def MikesGrandStore_listing_parser(soup):
         # Finding the Category
         cat = a.find('p', {'class': 'category uppercase is-smaller no-text-overflow product-cat op-7'}).text
         cat = cat.replace("class:", "")
+        cat = cleanString(cat)
         cat = cat.strip()
         category.append(cat)

         # Finding product rating
         rating = a.find('strong', {'class': 'rating'}).text
+        rating = cleanNumbers(rating)
         rating = rating.strip()
         rating_item.append(rating)

         # Searching for CVE and MS categories
         cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
         if not cve:

@@ -244,11 +220,6 @@ def MikesGrandStore_links_parser(soup):
     container = soup.find('div', {"class": "products row row-small large-columns-3 medium-columns-3 small-columns-2 equalize-box"})
     listing = container.findAll('div', recursive=False)

-    # for a in listing:
-    #     bae = a.find('a', {"class": "text-info"}, href=True)
-    #     link = bae['href']
-    #     href.append(link)
-
     for a in listing:
         bae = a.findAll('a', href=True)
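Aside: both price fixes above hinge on the same WooCommerce markup detail: the dollar amount is the text node immediately following the currency-symbol span, so the old code that called string methods on the bare next_sibling was fragile. A small self-contained sketch against hypothetical HTML, with the project's cleanNumbers() utility approximated by a regex since its real definition lives in Utilities:

    import re
    from bs4 import BeautifulSoup

    # Hypothetical WooCommerce-style price markup.
    html = '<span class="price"><span class="woocommerce-Price-currencySymbol">$</span>1,337.00</span>'
    soup = BeautifulSoup(html, 'html.parser')

    # The amount is the text node right after the currency-symbol span; the
    # parser calls .text on it, str() is equivalent and works on any bs4 version.
    amount = str(soup.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling)

    # Rough stand-in for cleanNumbers(): keep digits and dots only.
    usd = ''.join(re.findall(r'[\d.]+', amount)).strip()
    print(usd)  # -> 1337.00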


MarketPlaces/WeTheNorth/crawler_selenium.py  (+17 -45)

@@ -27,12 +27,12 @@ from MarketPlaces.Utilities.utilities import cleanHTML
 import selenium

 counter = 1
-baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
+baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/'

 # Opens Tor Browser, crawls the website
 def startCrawling():
-    marketName = getMarketName()
+    marketName = getMKTName()
     driver = getAccess()

     if driver != 'down':

@@ -48,7 +48,6 @@ def startCrawling():
 # Login using premade account credentials and do login captcha manually
 def login(driver):
-    time.sleep(3)
     #wait for login page
     input("Press ENTER when CAPTCHA is completed\n")

@@ -60,32 +59,6 @@ def login(driver):
     #Password here
     passwordBox.send_keys('fishowal')

-    # wait for captcha page show up
-    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img")))
-
-    # save captcha to local
-    driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img').screenshot(
-        r'..\WeTheNorth\captcha.png')
-
-    # This method will show image in any image viewer
-    im = Image.open(r'..\WeTheNorth\captcha.png')
-    im.show()
-
-    # wait until input space show up
-    inputBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[4]/input')
-
-    # ask user input captcha solution in terminal
-    userIn = input("Enter solution: ")
-
-    # send user solution into the input space
-    inputBox.send_keys(userIn)
-
-    # click the verify(submit) button
-    driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div[2]/div/div[3]/form/div[5]/input").click()
-
     input("Press ENTER when CAPTCHA is completed\n")

     # wait for listing page show up (This Xpath may need to change based on different seed url)

@@ -93,11 +66,7 @@ def login(driver):
         (By.XPATH, '//*[@id="information"]')))

 # Returns the name of the website
-def getMarketName():
-    name = 'WeTheNorth'
-    return name
-
-def getMKTName() -> str:
+def getMKTName():
     name = 'WeTheNorth'
     return name

@@ -132,8 +101,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    # ff_prof.set_preference("network.dns.disablePrefetch", True)
-    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)

@@ -201,10 +170,10 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []

-    # # Fraud Software
-    # links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=5&podcategory=3')
-    # # Guides and Tutorials - Hacking
-    # links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=3&podcategory=3')
+    # Fraud Software
+    links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=5&podcategory=3')
+    # Guides and Tutorials - Hacking
+    links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=3&podcategory=3')
     # Software and Malware
     links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=10')

@@ -222,7 +191,6 @@ def crawlForum(driver):
     print('Crawling :', link)
     try:
-        pg_counter = 1
         has_next_page = True
         count = 0

@@ -244,13 +212,17 @@ def crawlForum(driver):
                        savePage(driver, driver.page_source, item)
                        driver.back()

+                    # comment out
+                    break
+
+                # comment out
+                if count == 1:
+                    break

                try:
                    nav = driver.find_element(by=By.XPATH, value=
                        '/html/body/div[2]/div[3]/div[3]/div[2]/div[7]')
-                    pg_counter += 1
-                    pg_counter_str = "p=" + str(pg_counter) + "&"
-                    a = nav.find_element(by=By.XPATH, value = '//a[contains(@href,"'+pg_counter_str+'")]')
+                    a = nav.find_element(by=By.XPATH, value=f'//a[contains(@href,"p={count + 2}&")]')
                    link = a.get_attribute('href')
                    if link == "":
                        raise NoSuchElementException

@@ -263,7 +235,7 @@ def crawlForum(driver):
         print(link, e)
         i += 1

-    input("Crawling WeTheNorth market done sucessfully. Press ENTER to continue\n")
+    print("Crawling WeTheNorth market done.")

# Returns 'True' if the link is Topic link
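Aside: the pagination rewrite drops the separate pg_counter and derives the next page directly from the loop counter, so the counter can no longer drift from the page actually loaded. Since page 1 carries no "p=" parameter, after finishing page count + 1 the crawler looks for a link whose href contains "p={count + 2}&". A sketch of that lookup (the nav_xpath argument is a placeholder):

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    def next_page_link(driver, nav_xpath, count):
        # Locate the pagination container, then the anchor whose href
        # points at the page after the one just finished.
        nav = driver.find_element(by=By.XPATH, value=nav_xpath)
        a = nav.find_element(by=By.XPATH, value=f'//a[contains(@href,"p={count + 2}&")]')
        link = a.get_attribute('href')
        if not link:
            raise NoSuchElementException
        return link

One piece of mutable state fewer, and the page-number arithmetic sits next to the place it is used.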


MarketPlaces/WeTheNorth/parser.py  (+66 -53)

@@ -31,39 +31,35 @@ def wethenorth_description_parser(soup):
     left = "-1"          # 16 Product_QuantityLeft
     shipFrom = "-1"      # 17 Product_ShippedFrom
     shipTo = "-1"        # 18 Product_ShippedTo
+    image = "-1"         # 19 Product_Image
+    vendor_image = "-1"  # 20 Vendor_Image

     # Finding Product Name
     listDes = soup.find('div', {'class': "listDes"})
     name = listDes.find('h2').text
-    name = name.replace('\n', ' ')
-    name = name.replace(",", "")
+    name = cleanString(name)
     name = name.strip()

     # Finding Vendor
     vendor = listDes.find('b').text
-    vendor = vendor.replace(",", "")
-    vendor = vendor.replace("...", "")
-    vendor = vendor.replace("-", "")
+    vendor = cleanString(vendor)
     vendor = vendor.strip()

     # Finding Vendor Rating
-    rating = listDes.find('span',{'class':'levelSet'})
+    rating = listDes.find('span', {'class': 'levelSet'})
     rating = rating.text
-    rating = rating.replace('\n', ' ')
-    rating = rating.replace(",", "")
-    rating = rating.strip()
-
-    # Finding Successful Transactions
-    success = listDes.find_all('p')[1]
-    success = success.find('span').text
-    success = success.split()
-    success = success[0].strip()
+    rating = cleanNumbers(rating)
+    rating_vendor = rating.strip()

     # Finding Prices - all prices in We The North are in CAD, I left the CAD in the resulting String so that it would show CAD for all prices
-    padp = listDes.find('p',{'class':'padp'})
+    padp = listDes.find('p', {'class': 'padp'})
     USD = padp.find('span').text
     USD = USD.strip()

+    BTC = padp.find_next_sibling('p').text
+    BTC = cleanNumbers(BTC)
+    BTC = BTC.strip()

     # Finding Escrow - no escrow on WTN market
     shipping_info = listDes.find('tbody')

@@ -73,47 +69,32 @@ def wethenorth_description_parser(soup):
     # Finding Shipment Information (Origin)
     shipFrom = row1[-1].text
-    shipFrom=shipFrom.strip()
-    if shipFrom=="":
-        shipFrom="-1"
+    shipFrom = cleanString(shipFrom)
+    shipFrom = shipFrom.strip()
+    if shipFrom == "":
+        shipFrom = "-1"

     row2 = shipping_info[1].find_all('td')

     # Finding Shipment Information (Destination)
     shipTo = row2[-1].text
-    shipTo= shipTo.strip()
+    shipTo = cleanString(shipTo)
+    shipTo = shipTo.strip()
     if shipTo == "":
         shipTo = "-1"

     # Finding the Product description
-    describe = soup.find("div",{'class':'tabcontent'})
+    describe = soup.find("div", {'class': 'tabcontent'})
     describe = describe.find('p').text
-    describe = describe.replace("\n", " ")
-    describe = describe.replace("\r", " ")
+    describe = cleanString(describe)
     describe = describe.strip()

-    # cannot find any tag for these
-    '''
-    # Finding the Number of Product Reviews
-    tag = soup.findAll(text=re.compile('Reviews'))
-    for index in tag:
-        reviews = index
-        par = reviews.find('(')
-        if par >= 0:
-            reviews = reviews.replace("Reviews (", "")
-            reviews = reviews.replace(")", "")
-            reviews = reviews.split(",")
-            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
-        else:
-            review = "-1"
-    '''

     # Searching for CVE and MS categories
     # no CVE or MS for WTN market

     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
-           BTC, USD, EURO, sold, left, shipFrom, shipTo)
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

     # Sending the results
     return row

@@ -144,7 +125,9 @@ def wethenorth_listing_parser(soup):
     qLeft = []         # 17 Product_QuantityLeft
     shipFrom = []      # 18 Product_ShippedFrom
     shipTo = []        # 19 Product_ShippedTo
-    href = []          # 20 Product_Links
+    image = []         # 20 Product_Image
+    image_vendor = []  # 21 Vendor_Image
+    href = []          # 22 Product_Links

     right_content = soup.find('div', {"class": "right-content"})
     listing = right_content.findAll('div', {"class": "col-1search"})

@@ -158,20 +141,19 @@ def wethenorth_listing_parser(soup):
         # Adding the url to the list of urls
         link = bae[0].get('href')
-        link = cleanLink(link)
         href.append(link)

         # Finding the Vendor
         vendor_name = a.find('p', {'class': 'padp'})
         vendor_name = vendor_name.find('a').text
-        vendor_name = vendor_name.replace(",", "")
+        vendor_name = cleanString(vendor_name)
         vendor_name = vendor_name.strip()
         vendor.append(vendor_name)

         # Finding the Product
-        product = bae[0].text
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
+        product = a.find('div', {'class': 'col-1centre'})
+        product = product.find('div', {'class': 'head'}).find('a').text
+        product = cleanString(product)
         product = product.strip()
         name.append(product)

@@ -179,25 +161,56 @@ def wethenorth_listing_parser(soup):
         category_name = a.find('p', {'class': 'padp'}).text
         first_dash = category_name.find('-')
         second_dash = category_name[first_dash+1:].find('-')
-        category_name = category_name[first_dash+1:second_dash]
-        category_name=category_name.strip()
+        category_name = category_name[first_dash+1: first_dash + second_dash]
+        category_name = cleanString(category_name)
+        category_name = category_name.strip()
         category.append(category_name)

+        # Finding Success Transactions
+        vendor_success = a.find('p', {'class': 'padp'}).text
+        first_dash = vendor_success.find('(')
+        vendor_success = vendor_success[first_dash + 1:]
+        vendor_success = cleanNumbers(vendor_success)
+        vendor_success = vendor_success.strip()
+        success.append(vendor_success)

         # Finding Views
         view_count = a.text
         view_count = view_count[view_count.find('Views:'): view_count.find('Sales:')]
         view_count = view_count.replace('Views:', ' ')
+        view_count = view_count.replace('/', ' ')
+        view_count = cleanNumbers(view_count)
         view_count = view_count.strip()
         views.append(view_count)

-        # Finding success sales
+        # Finding Quantity Sold
         sold_count = a.text
         sold_count = sold_count[sold_count.find('Sales:'): sold_count.find('Short')]
         sold_count = sold_count.replace('Sales:', ' ')
+        sold_count = sold_count.replace('/', ' ')
+        sold_count = cleanNumbers(sold_count)
         sold_count = sold_count.strip()
-        success.append(sold_count)
+        sold.append(sold_count)

+        right = a.find('div', {'class': 'col-1right'})

+        # Finding USD
+        usd = right.find('a').text
+        usd = "CAD " + usd.strip()
+        USD.append(usd)

+        # Finding BTC
+        btc = right.text
+        first_dash = btc.find('(')
+        second_dash = btc[first_dash + 1:].find(')')
+        btc = btc[first_dash + 1: first_dash + second_dash]
+        btc = cleanNumbers(btc)
+        btc = btc.strip()
+        BTC.append(btc)

+        # Finding Product Image
+        product_image = right.find('img')
+        product_image = product_image.get('src')
+        product_image = product_image.split('base64,')[-1]
+        image.append(product_image)

         # Searching for CVE and MS categories
         # no CVE or MS in WTN market

@@ -229,7 +242,7 @@ def wethenorth_listing_parser(soup):
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)

def wethenorth_links_parser(soup):
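Aside: the new image fields capture thumbnails that the market embeds as data: URIs, keeping only the payload after "base64,". A sketch with a hypothetical snippet showing that the stored string decodes back to raw image bytes:

    import base64
    from bs4 import BeautifulSoup

    # Hypothetical listing fragment with an inline data-URI thumbnail.
    html = '<div class="col-1right"><img src="data:image/png;base64,iVBORw0KGgo="></div>'
    soup = BeautifulSoup(html, 'html.parser')

    product_image = soup.find('img').get('src')
    product_image = product_image.split('base64,')[-1]  # strip the data-URI prefix

    raw = base64.b64decode(product_image)               # back to PNG bytes
    print(len(raw), "bytes")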

