
user image tracking ONLY (missing post image) for some forums

main · westernmeadow, 1 year ago · commit 0345836e20
11 changed files with 82 additions and 72 deletions
  1. +1  -0   .idea/DW_Pipeline_Test.iml
  2. +8  -0   Forums/Altenens/parser.py
  3. +5  -0   Forums/BestCardingWorld/parser.py
  4. +5  -1   Forums/Cardingleaks/parser.py
  5. +5  -0   Forums/CryptBB/parser.py
  6. +15 -2   Forums/HiddenAnswers/parser.py
  7. +3  -4   Forums/OnniForums/parser.py
  8. +8  -0   Forums/Utilities/utilities.py
  9. +2  -5   MarketPlaces/Apocalypse/parser.py
  10. +7  -21  MarketPlaces/GoFish/crawler_selenium.py
  11. +23 -39  MarketPlaces/Torzon/crawler_selenium.py

+1 -0  .idea/DW_Pipeline_Test.iml

@ -28,6 +28,7 @@
<option value="$MODULE_DIR$/Forums/Libre" />
<option value="$MODULE_DIR$/Forums/Procrax" />
<option value="$MODULE_DIR$/MarketPlaces/DarkBazar" />
<option value="$MODULE_DIR$/MarketPlaces/AnonMarket" />
</list>
</option>
</component>

+8 -0  Forums/Altenens/parser.py

@ -22,6 +22,7 @@ def altenens_description_parser(soup):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = []
topic = soup.find("h1", {"class": "p-title-value"}).text
topic = cleanString(topic.strip())
@ -66,6 +67,13 @@ def altenens_description_parser(soup):
date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
addDate.append(date_time_obj)
img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
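
Every forum hunk in this commit follows the same pattern: locate the post's avatar <img>, keep only the payload of its base64 data URI, and record "-1" when no avatar exists. A minimal helper sketch of that pattern (the name avatar_to_base64 is hypothetical and not part of the commit; Altenens and HiddenAnswers use the guarded form shown here, while the BestCardingWorld, Cardingleaks and CryptBB hunks call .get('src') without the None check):

    def avatar_to_base64(container):
        # container: the element wrapping the avatar, e.g. the "message-avatar-wrapper" div
        if container is None:
            return "-1"
        img = container.find('img')
        if img is None or not img.get('src'):
            return "-1"
        # inline avatars are data URIs; keep only the part after "base64,"
        return img.get('src').split('base64,')[-1]

    image_user.append(avatar_to_base64(ipost.find('div', {"class": "message-avatar-wrapper"})))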


+5 -0  Forums/BestCardingWorld/parser.py

@ -25,6 +25,7 @@ def bestcardingworld_description_parser(soup):
sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 7 all messages of each post
interest = [] # 8 all user's interest in each post
image_user = []
# Finding the topic (should be just one coming from the Listing Page)
@ -150,6 +151,10 @@ def bestcardingworld_description_parser(soup):
feedback.append("-1")
img = ipost.find('div', {"class": "avatar-container"}).find('img', {"class": "avatar"})
img = img.get('src').split('base64,')[-1]
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)


+5 -1  Forums/Cardingleaks/parser.py

@ -25,6 +25,7 @@ def cardingleaks_description_parser(soup: Tag):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = []
li = soup.find("h1", {"class": "p-title-value"})
topic = cleanString(li.text.strip())
@ -62,7 +63,10 @@ def cardingleaks_description_parser(soup: Tag):
datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
img = ipost.find('div', {"class": "message-avatar"}).find('img')
img = img.get('src').split('base64,')[-1]
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)


+5 -0  Forums/CryptBB/parser.py

@ -25,6 +25,7 @@ def cryptBB_description_parser(soup):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = []
# Finding the topic (should be just one coming from the Listing Page)
@ -155,6 +156,10 @@ def cryptBB_description_parser(soup):
feedback.append("-1")
img = ipost.find('div', {"class": "author_avatar"}).find('img')
img = img.get('src').split('base64,')[-1]
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)


+15 -2  Forums/HiddenAnswers/parser.py

@ -22,7 +22,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
sign: List[str] = [] # all user's signature in each post (usually a standard message after the content of the post)
post: List[str] = [] # all messages of each post
interest: List[str] = [] # all user's interest in each post
image_user = []
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("h1").find("span", {"itemprop": "name"})
@ -53,7 +53,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
feedback.append("-1")
sign.append("-1")
interest.append("-1")
img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
answer_list: ResultSet[Tag] = soup.find("div", {"class": "qa-a-list"}).find_all("div", {"class": "qa-a-list-item"})
@ -84,6 +90,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
sign.append("-1")
interest.append("-1")
img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
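
The question and answer branches above repeat the same None guard; with a helper like the avatar_to_base64 sketch shown after the Altenens hunk, each branch reduces to a single append (a hypothetical refactor, not what this commit does):

    image_user.append(avatar_to_base64(question.find('span', {"class": "qa-q-view-avatar-meta"})))
    image_user.append(avatar_to_base64(replies.find('span', {"class": "qa-a-item-avatar-meta"})))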


+3 -4  Forums/OnniForums/parser.py

@ -143,8 +143,7 @@ def onniForums_listing_parser(soup: BeautifulSoup):
body = thread.find("span",{"class": "subject_new"})
try:
post_subject: str = body.text #getting the topic
except AttributeError:
except:
body = thread.find("span",{"class": "subject_old"})
post_subject: str = body.text
@ -153,10 +152,10 @@ def onniForums_listing_parser(soup: BeautifulSoup):
reply_count = thread.find_all("td", {"align": "center"})[2].text
post.append(reply_count)
post.append(cleanNumbers(reply_count))
views = thread.find_all("td", {"align": "center"})[3].text
view.append(views)
view.append(cleanNumbers(views))
# dates_added: str = thread.find("span",{"class" : "thread_start_datetime smalltext"}).text
# dates_added_cleaned = dates_added.split(',')[0]


+8 -0  Forums/Utilities/utilities.py

@ -306,6 +306,14 @@ def convertFromLongDate(longDate, crawlerdate):
return correct_date
def cleanNumbers(inputString):
reg_ex = re.compile(r'[^\d.]+')
updated_string = reg_ex.sub('', inputString)
return updated_string
def aes_encryption(item):
to_bytes = bytes(item)
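
The new cleanNumbers helper strips every character except digits and periods, which is why the OnniForums listing parser now wraps its reply and view counts in it. A quick illustration with made-up inputs:

    import re

    def cleanNumbers(inputString):
        reg_ex = re.compile(r'[^\d.]+')   # drop everything that is not a digit or '.'
        return reg_ex.sub('', inputString)

    print(cleanNumbers("Replies: 1,204"))   # -> "1204"
    print(cleanNumbers("3.5k views"))       # -> "3.5"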


+2 -5  MarketPlaces/Apocalypse/parser.py

@ -141,8 +141,6 @@ def apocalypse_listing_parser(soup: Tag):
product_price = prod.find("span", {"class": "priceP"}).text
USD.append(cleanString(product_price.strip()))
product_sold = prod.find("span", {"class": "badge badge-success"}).text
sold.append(cleanString(product_sold.strip()))
@ -168,7 +166,6 @@ def apocalypse_listing_parser(soup: Tag):
# When split by the star (★), it should return a 2-value array
product_vendor, product_vendor_rating = product_vendor_tag.text.split("★")
try:
vendor.append(cleanString(product_vendor.strip()))
rating.append(cleanString(product_vendor_rating.strip()))
@ -179,8 +176,7 @@ def apocalypse_listing_parser(soup: Tag):
href.append(product_href)
nm += 1
return organizeProducts(
marketplace=mktName,
nm=nm,
@ -208,6 +204,7 @@ def apocalypse_listing_parser(soup: Tag):
image_vendor=image_vendor
)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page


+7 -21  MarketPlaces/GoFish/crawler_selenium.py

@ -31,7 +31,6 @@ baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -41,24 +40,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -75,7 +61,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -102,7 +88,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -118,7 +104,7 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
# driver.maximize_window()
driver.maximize_window()
return driver
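
Both crawler files switch permissions.default.image from 1 to 3. In Firefox that preference is usually documented as 1 = load all images, 2 = block all images, 3 = block only third-party images, so the likely intent (not stated in the diff itself) is to keep first-party avatars and product images while skipping externally hosted ones:

    # assumed Firefox semantics: 1 = allow all, 2 = block all, 3 = block third-party images
    ff_prof.set_preference("permissions.default.image", 3)  # keep on-site images, drop third-party ones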
@ -140,7 +126,6 @@ def getAccess():
# then allows for manual solving of captcha in the terminal
#@param: current selenium web driver
def login(driver):
input("Press ENTER when CAPTCHA is completed\n")
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@ -154,11 +139,12 @@ def login(driver):
# Password here
passwordBox.send_keys('DementedBed123-')
input("Press ENTER when CAPTCHA and exit pressed is completed\n")
input("Press ENTER when CAPTCHA is completed and logged in\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[2]/div/div/div/div/div/div[1]/a/img")))
(By.XPATH, "/html/body/div[1]/div[3]/div[1]/div[3]/ul/div")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):


+23 -39  MarketPlaces/Torzon/crawler_selenium.py

@ -34,7 +34,6 @@ BASE_URL = 'http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onio
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
opentor()
mktName = getMKTName()
driver = getAccess()
@ -44,25 +43,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, BASE_URL, False)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -79,7 +64,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -96,7 +81,6 @@ def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
@ -107,7 +91,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue
# ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -123,6 +107,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
@ -146,15 +132,13 @@ def login(driver):
input("Press ENTER when CAPTCHA is completed and page is loaded\n")
# wait for page to show up (This Xpath may need to change based on different seed url)
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(page, url):
cleanPage = cleanHTML(page)
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
# filePath = getFullPathName("Hello")
os.makedirs(os.path.dirname(filePath), exist_ok=True)
with open(filePath, 'wb') as file:
file.write(cleanPage.encode('utf-8'))
# open(filePath, 'wb').write(cleanPage.encode('utf-8'))
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
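
savePage in the Torzon crawler now takes the driver and forwards it to cleanHTML, matching the call sites below (savePage(driver, html, link)). A reconstruction of the function after this change (cleanHTML and getFullPathName are the project's own utilities; whether the context-managed write or the one-line open(...).write(...) is the surviving form is ambiguous in the copied hunk, and this sketch assumes the former):

    def savePage(driver, page, url):
        cleanPage = cleanHTML(driver, page)
        filePath = getFullPathName(url)
        os.makedirs(os.path.dirname(filePath), exist_ok=True)
        with open(filePath, 'wb') as file:
            file.write(cleanPage.encode('utf-8'))
        return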
@ -191,16 +175,16 @@ def getInterestedLinks():
links = []
# # services
links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# # software & malware
# software & malware
links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# # fraud
links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# # guides
links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
# links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
return links
@ -227,27 +211,27 @@ def crawlForum(driver):
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(BASE_URL, str(item))
try:
time.sleep(1.5) # to keep from detecting click speed
# time.sleep(1.5) # to keep from detecting click speed
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
time.sleep(1.5)
savePage(driver, driver.page_source, item)
# time.sleep(1.5)
driver.back()
# to keep from detecting click speed
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
break
# comment out
if count == 1:
break
try:
# nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')
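
The crawlForum hunk re-enables the previously commented iteration limits: the inner loop breaks after saving the first product page, and "if count == 1: break" stops after the first listing page, which reads like a test-run cap rather than a full crawl. A trimmed sketch of the loop shape implied by the hunk (loop nesting and indentation are approximated, since the copied diff loses them, and the try/except-with-refresh error handling is omitted):

    for link in getInterestedLinks():
        driver.get(link)
        count = 0
        has_next_page = True
        while has_next_page:
            html = driver.page_source
            savePage(driver, html, link)

            for item in productPages(html):
                itemURL = urlparse.urljoin(BASE_URL, str(item))
                driver.get(itemURL)
                savePage(driver, driver.page_source, item)
                driver.back()
                break                  # test limit: first product only

            count += 1
            if count == 1:
                break                  # test limit: first listing page only
            # otherwise locate the pagination element (the nav XPath below) and continue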

