diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py
index 19155d5..bdad19d 100644
--- a/Forums/Altenens/parser.py
+++ b/Forums/Altenens/parser.py
@@ -22,6 +22,7 @@ def altenens_description_parser(soup):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
+ image_user = [] # 9 all user avatars of each post
topic = soup.find("h1", {"class": "p-title-value"}).text
topic = cleanString(topic.strip())
@@ -66,6 +67,13 @@ def altenens_description_parser(soup):
date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
addDate.append(date_time_obj)
+ img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img')
+ if img is not None:
+ img = img.get('src').split('base64,')[-1]
+ else:
+ img = "-1"
+ image_user.append(img)
+
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
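The `get('src').split('base64,')[-1]` pattern above assumes avatars arrive as inline data URIs (`data:image/png;base64,<payload>`), so what gets stored is the raw base64 payload (or the full `src` when no `base64,` marker is present). A minimal sketch of decoding such a payload later; the `save_avatar` helper and its error handling are illustrative, not part of this repo:

```python
import base64
import binascii


def save_avatar(payload: str, out_path: str) -> bool:
    """Decode a captured base64 avatar payload and write it to disk.

    Returns False for the "-1" sentinel or when the payload is not valid
    base64 (e.g. the src was a regular URL rather than a data URI).
    """
    if payload == "-1":  # sentinel used by the parsers when no avatar is found
        return False
    try:
        raw = base64.b64decode(payload, validate=True)
    except (binascii.Error, ValueError):
        return False  # src was likely a plain URL, not a data URI
    with open(out_path, "wb") as f:
        f.write(raw)
    return True
```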
diff --git a/Forums/BestCardingWorld/parser.py b/Forums/BestCardingWorld/parser.py
index c4ca6e0..5a294c6 100644
--- a/Forums/BestCardingWorld/parser.py
+++ b/Forums/BestCardingWorld/parser.py
@@ -25,6 +25,7 @@ def bestcardingworld_description_parser(soup):
sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 7 all messages of each post
interest = [] # 8 all user's interest in each post
+ image_user = [] # 9 all user avatars of each post
# Finding the topic (should be just one coming from the Listing Page)
@@ -150,6 +151,10 @@ def bestcardingworld_description_parser(soup):
feedback.append("-1")
+ img = ipost.find('div', {"class": "avatar-container"}).find('img', {"class": "avatar"})
+ if img is not None:
+ img = img.get('src').split('base64,')[-1]
+ else:
+ img = "-1"
+ image_user.append(img)
+
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
diff --git a/Forums/Cardingleaks/parser.py b/Forums/Cardingleaks/parser.py
index 98ddf3a..7ab139d 100644
--- a/Forums/Cardingleaks/parser.py
+++ b/Forums/Cardingleaks/parser.py
@@ -25,6 +25,7 @@ def cardingleaks_description_parser(soup: Tag):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
+ image_user = [] # 9 all user avatars of each post
li = soup.find("h1", {"class": "p-title-value"})
topic = cleanString(li.text.strip())
@@ -62,7 +63,10 @@ def cardingleaks_description_parser(soup: Tag):
datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
-
+
+ img = ipost.find('div', {"class": "message-avatar"}).find('img')
+ if img is not None:
+ img = img.get('src').split('base64,')[-1]
+ else:
+ img = "-1"
+ image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py
index bcef5f8..bfe4403 100644
--- a/Forums/CryptBB/parser.py
+++ b/Forums/CryptBB/parser.py
@@ -25,6 +25,7 @@ def cryptBB_description_parser(soup):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
+ image_user = [] # 9 all user avatars of each post
# Finding the topic (should be just one coming from the Listing Page)
@@ -155,6 +156,10 @@ def cryptBB_description_parser(soup):
feedback.append("-1")
+ img = ipost.find('div', {"class": "author_avatar"}).find('img')
+ if img is not None:
+ img = img.get('src').split('base64,')[-1]
+ else:
+ img = "-1"
+ image_user.append(img)
+
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py
index 16b56cb..e42ace8 100644
--- a/Forums/HiddenAnswers/parser.py
+++ b/Forums/HiddenAnswers/parser.py
@@ -22,7 +22,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
sign: List[str] = [] # all user's signature in each post (usually a standard message after the content of the post)
post: List[str] = [] # all messages of each post
interest: List[str] = [] # all user's interest in each post
-
+ image_user: List[str] = [] # all user avatars in each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("h1").find("span", {"itemprop": "name"})
@@ -53,7 +53,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
feedback.append("-1")
sign.append("-1")
interest.append("-1")
-
+
+ img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img')
+ if img is not None:
+ img = img.get('src').split('base64,')[-1]
+ else:
+ img = "-1"
+ image_user.append(img)
answer_list: ResultSet[Tag] = soup.find("div", {"class": "qa-a-list"}).find_all("div", {"class": "qa-a-list-item"})
@@ -84,6 +90,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
sign.append("-1")
interest.append("-1")
+ img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img')
+ if img is not None:
+ img = img.get('src').split('base64,')[-1]
+ else:
+ img = "-1"
+ image_user.append(img)
+
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
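The guarded lookup repeated across these parsers (find the wrapper tag, find the `<img>`, fall back to `"-1"`) could be factored into a single helper; a sketch under the assumption that BeautifulSoup tags are passed in — `get_user_avatar` is hypothetical, not an existing function in this repo:

```python
from typing import Optional

from bs4 import Tag


def get_user_avatar(container: Optional[Tag]) -> str:
    """Return the base64 payload of the avatar <img> inside container, or "-1".

    Mirrors the guarded pattern used in the Altenens and HiddenAnswers parsers.
    """
    if container is None:
        return "-1"
    img = container.find('img')
    if img is None or img.get('src') is None:
        return "-1"
    return img.get('src').split('base64,')[-1]
```

Callers would then reduce to one line, e.g. `image_user.append(get_user_avatar(question.find('span', {"class": "qa-q-view-avatar-meta"})))`.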
diff --git a/Forums/OnniForums/parser.py b/Forums/OnniForums/parser.py
index 3854141..e0c780a 100644
--- a/Forums/OnniForums/parser.py
+++ b/Forums/OnniForums/parser.py
@@ -143,8 +143,7 @@ def onniForums_listing_parser(soup: BeautifulSoup):
body = thread.find("span",{"class": "subject_new"})
try:
post_subject: str = body.text #getting the topic
-
- except AttributeError:
+ except:
body = thread.find("span",{"class": "subject_old"})
post_subject: str = body.text
@@ -153,10 +152,10 @@ def onniForums_listing_parser(soup: BeautifulSoup):
reply_count = thread.find_all("td", {"align": "center"})[2].text
- post.append(reply_count)
+ post.append(cleanNumbers(reply_count))
views = thread.find_all("td", {"align": "center"})[3].text
- view.append(views)
+ view.append(cleanNumbers(views))
# dates_added: str = thread.find("span",{"class" : "thread_start_datetime smalltext"}).text
# dates_added_cleaned = dates_added.split(',')[0]
diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py
index 2c2d89f..e7afcb8 100644
--- a/Forums/Utilities/utilities.py
+++ b/Forums/Utilities/utilities.py
@@ -306,6 +306,14 @@ def convertFromLongDate(longDate, crawlerdate):
return correct_date
+# Strips every character except digits and periods from inputString (e.g. "1,024" -> "1024")
+def cleanNumbers(inputString):
+
+ reg_ex = re.compile(r'[^\d.]+')
+ updated_string = reg_ex.sub('', inputString)
+
+ return updated_string
+
+
def aes_encryption(item):
to_bytes = bytes(item)
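`cleanNumbers` keeps only digits and periods, which suits the reply and view counts it is applied to in `onniForums_listing_parser`; note it returns a string, so callers that need integers must still convert. Expected behavior on hypothetical inputs:

```python
>>> cleanNumbers("Replies: 1,024")
'1024'
>>> cleanNumbers("3.5k views")  # the "k" is dropped, not expanded
'3.5'
>>> cleanNumbers("Views")       # no digits at all yields an empty string
''
```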
diff --git a/MarketPlaces/Apocalypse/parser.py b/MarketPlaces/Apocalypse/parser.py
index b7a4f63..8cd3a5b 100644
--- a/MarketPlaces/Apocalypse/parser.py
+++ b/MarketPlaces/Apocalypse/parser.py
@@ -141,8 +141,6 @@ def apocalypse_listing_parser(soup: Tag):
product_price = prod.find("span", {"class": "priceP"}).text
USD.append(cleanString(product_price.strip()))
-
-
product_sold = prod.find("span", {"class": "badge badge-success"}).text
sold.append(cleanString(product_sold.strip()))
@@ -168,7 +166,6 @@ def apocalypse_listing_parser(soup: Tag):
# When split by the star (★), it should return a 2-value array
product_vendor, product_vendor_rating = product_vendor_tag.text.split("★")
-
try:
vendor.append(cleanString(product_vendor.strip()))
rating.append(cleanString(product_vendor_rating.strip()))
@@ -179,8 +176,7 @@ def apocalypse_listing_parser(soup: Tag):
href.append(product_href)
nm += 1
-
-
+
return organizeProducts(
marketplace=mktName,
nm=nm,
@@ -208,6 +204,7 @@ def apocalypse_listing_parser(soup: Tag):
image_vendor=image_vendor
)
+
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
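The `split("★")` in this parser happens outside the `try`, so a vendor cell without a star raises an uncaught `ValueError` during unpacking. A tolerant variant, sketched with a hypothetical helper name:

```python
def split_vendor_rating(cell_text: str):
    """Split a "name★rating" vendor cell; fall back to "-1" when the star is absent."""
    parts = cell_text.split("★")
    if len(parts) == 2:
        return parts[0].strip(), parts[1].strip()
    return cell_text.strip(), "-1"
```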
diff --git a/MarketPlaces/GoFish/crawler_selenium.py b/MarketPlaces/GoFish/crawler_selenium.py
index 0f87696..e5af35b 100644
--- a/MarketPlaces/GoFish/crawler_selenium.py
+++ b/MarketPlaces/GoFish/crawler_selenium.py
@@ -31,7 +31,6 @@ baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
- # opentor()
mktName = getMKTName()
driver = getAccess()
@@ -41,24 +40,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
- closetor(driver)
+ closeDriver(driver)
new_parse(mktName, baseURL, True)
-# Opens Tor Browser
-#prompts for ENTER input to continue
-def opentor():
- from MarketPlaces.Initialization.markets_mining import config
-
- global pid
- print("Connecting Tor...")
- pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
- pid = pro.pid
- time.sleep(7.5)
- input('Tor Connected. Press ENTER to continue\n')
- return
-
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@@ -75,7 +61,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
-def closetor(driver):
+def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@@ -102,7 +88,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 1)
+ ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -118,7 +104,7 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
- # driver.maximize_window()
+ driver.maximize_window()
return driver
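Both crawlers in this diff flip `permissions.default.image` from 1 to 3. Assuming the usual Firefox semantics for this preference (1 = load all images, 2 = block all, 3 = block third-party images), the change skips third-party image loads while still fetching first-party content such as avatars; condensed:

```python
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

ff_prof = FirefoxProfile()
# assumed Firefox semantics: 1 = allow all images, 2 = block all, 3 = block third-party only
ff_prof.set_preference("permissions.default.image", 3)
```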
@@ -140,7 +126,6 @@ def getAccess():
# then allows for manual solving of captcha in the terminal
#@param: current selenium web driver
def login(driver):
- input("Press ENTER when CAPTCHA is completed\n")
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@@ -154,11 +139,12 @@ def login(driver):
# Password here
passwordBox.send_keys('DementedBed123-')
- input("Press ENTER when CAPTCHA and exit pressed is completed\n")
+ input("Press ENTER when CAPTCHA is completed and logged in\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div/div[2]/div/div/div/div/div/div[1]/a/img")))
+ (By.XPATH, "/html/body/div[1]/div[3]/div[1]/div[3]/ul/div")))
+
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
diff --git a/MarketPlaces/Torzon/crawler_selenium.py b/MarketPlaces/Torzon/crawler_selenium.py
index 6636e80..8560c57 100644
--- a/MarketPlaces/Torzon/crawler_selenium.py
+++ b/MarketPlaces/Torzon/crawler_selenium.py
@@ -34,7 +34,6 @@ BASE_URL = 'http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onio
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
- opentor()
mktName = getMKTName()
driver = getAccess()
@@ -44,25 +43,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
- closetor(driver)
+ closeDriver(driver)
new_parse(mktName, BASE_URL, False)
-# Opens Tor Browser
-#prompts for ENTER input to continue
-def opentor():
- from MarketPlaces.Initialization.markets_mining import config
-
- global pid
- print("Connecting Tor...")
- pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
- pid = pro.pid
- time.sleep(7.5)
- input('Tor Connected. Press ENTER to continue\n')
- return
-
-
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@@ -79,7 +64,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
-def closetor(driver):
+def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@@ -96,7 +81,6 @@ def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
@@ -107,7 +91,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue
# ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue
- ff_prof.set_preference("permissions.default.image", 1)
+ ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -123,6 +107,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -146,15 +132,13 @@ def login(driver):
input("Press ENTER when CAPTCHA is completed and page is loaded\n")
# wait for page to show up (This Xpath may need to change based on different seed url)
+
# Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(page, url):
- cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+ cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
- # filePath = getFullPathName("Hello")
os.makedirs(os.path.dirname(filePath), exist_ok=True)
- with open(filePath, 'wb') as file:
- file.write(cleanPage.encode('utf-8'))
- # open(filePath, 'wb').write(cleanPage.encode('utf-8'))
+ open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
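The rewritten `savePage` drops the `with` block, leaving the file handle to be closed implicitly by the garbage collector; a sketch that keeps the new `(driver, page, url)` signature but retains the context manager so the handle is closed deterministically:

```python
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as file:  # context manager guarantees the handle is closed
        file.write(cleanPage.encode('utf-8'))
    return
```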
@@ -191,16 +175,16 @@ def getInterestedLinks():
links = []
# # services
- links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
+ # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
- # # software & malware
+ # software & malware
links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# # fraud
- links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
+ # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# # guides
- links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
+ # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
return links
@@ -227,27 +211,27 @@ def crawlForum(driver):
except:
driver.refresh()
html = driver.page_source
- savePage(html, link)
+ savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(BASE_URL, str(item))
try:
- time.sleep(1.5) # to keep from detecting click speed
+ # time.sleep(1.5) # to keep from detecting click speed
driver.get(itemURL)
except:
driver.refresh()
- savePage(driver.page_source, item)
- time.sleep(1.5)
+ savePage(driver, driver.page_source, item)
+ # time.sleep(1.5)
driver.back()
# to keep from detecting click speed
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
+ # comment out
+ break
+
+ # comment out
+ if count == 1:
+ break
try:
# nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')