fixed change tracking bug and image tracking for AnonMarket

westernmeadow · 1 year ago · branch main · commit d1d53d9b23

5 changed files with 103 additions and 90 deletions
  1. Forums/Utilities/utilities.py (+15 -11)
  2. MarketPlaces/AnonMarket/crawler_selenium.py (+4 -3)
  3. MarketPlaces/AnonMarket/parser.py (+69 -62)
  4. MarketPlaces/DB_Connection/db_connection.py (+2 -2)
  5. MarketPlaces/Utilities/utilities.py (+13 -12)

Forums/Utilities/utilities.py (+15 -11)

@@ -195,12 +195,16 @@ def cleanLink(originalLink):
 def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate):
-    day = time.strftime("%m/%d/%Y")
-    ahora = time.strftime("%I:%M:%S")
     rw = []
+    current_time = datetime.now()
+    day = current_time.strftime("%m/%d/%Y")
     for n in range(nm):
+        current_time += timedelta(seconds=2)
+        ahora = current_time.strftime("%I:%M:%S")
         lne = forum # 0
         lne += ","
         lne += board # 1
@@ -400,19 +404,19 @@ def cleanHTML(driver, html):
     ]
     # remove images
-    clean_html = re.sub(r"<svg.*?>", "", clean_html)
+    clean_html = re.sub(r"<svg[\s\S]*?svg>", "", clean_html)
     for fmat in formats:
-        clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)
-    clean_html = re.sub(r"<canvas.*?>", "", clean_html)
+        clean_html = re.sub(r"<object.*" + fmat + "[\s\S]*?object>", "", clean_html)
+    clean_html = re.sub(r"<canvas[\s\S]*?canvas>", "", clean_html)
     # remove JavaScript
-    clean_html = re.sub(r"<script.*?>", "", clean_html)
-    clean_html = re.sub(r"<object.*javascript.*?>", "", clean_html)
-    clean_html = re.sub(r"<aplet.*mayscript?>", "", clean_html)
-    clean_html = re.sub(r"<embed.*scriptable?>", "", clean_html)
+    clean_html = re.sub(r"<script[\s\S]*?script>", "", clean_html)
+    clean_html = re.sub(r"<object.*javascript[\s\S]*?object>", "", clean_html)
+    clean_html = re.sub(r"<aplet.*mayscript[\s\S]*?aplet>", "", clean_html)
+    clean_html = re.sub(r"<embed.*scriptable[\s\S]*?embed>", "", clean_html)
     # image and JavaScript
-    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html)
+    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)
     return clean_html
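
Side note on the cleanHTML change above (an illustration, not part of the diff): "." in a Python regex does not match newlines, so the old pattern r"<svg.*?>" stripped only the opening tag and left the element body in the page, while "[\s\S]" matches any character including newlines, so the new pattern consumes the whole element through its closing tag. A minimal sketch of the difference:

    import re

    html = '<p>keep</p><svg width="10">\n<circle r="5"/>\n</svg><p>keep</p>'

    # old pattern: removes only the opening <svg ...> tag
    print(repr(re.sub(r"<svg.*?>", "", html)))

    # new pattern: removes the element through its closing </svg> tag
    print(repr(re.sub(r"<svg[\s\S]*?svg>", "", html)))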


MarketPlaces/AnonMarket/crawler_selenium.py (+4 -3)

@@ -159,9 +159,8 @@ def getNameFromURL(url):
 #as you can see they are categories of products
 def getInterestedLinks():
     links = []
-    # # Software
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/civil_softwares')
-    # # Malware
+    # Malware
     links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware')
     # # Bootkits
     # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits')
@@ -195,6 +194,8 @@ def getInterestedLinks():
     # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit')
     # # Security
     # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security')
+    # # Ransomware
+    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/ransomware')
     return links


MarketPlaces/AnonMarket/parser.py (+69 -62)

@@ -49,26 +49,29 @@ def AnonMarket_description_parser(soup):
     info_div = soup.find('div', {'class': 'information'})
     table = info_div.find('table') if info_div else None
     if table:
-        # Find all table rows
-        rows = table.find_all('tr')
-        # Parse each row to get relevant data
-        data = {}
-        for row in rows:
-            columns = row.find_all('td')
-            if len(columns) == 3:
-                key = columns[0].text.strip()
-                value = columns[2].text.strip()
-                data[key] = value
-        # Extract specific data from the dictionary and assign them to individual variables
-        vendor = data.get('Vendor', '-1')
-        shipFrom = data.get('Location', '-1')
-        shipTo = data.get('Ships to', '-1')
-        category = data.get('Category', '-1')
-        USD = data.get('Price', '-1').split()[0]
-        left = data.get('Stock', '-1')
+        # Find all table rows
+        rows = table.find_all('tr')
+        # Parse each row to get relevant data
+        data = {}
+        for row in rows:
+            columns = row.find_all('td')
+            if len(columns) == 3:
+                key = columns[0].text.strip()
+                value = columns[2].text.strip()
+                data[key] = value
+        # Extract specific data from the dictionary and assign them to individual variables
+        vendor = data.get('Vendor', '-1')
+        shipFrom = data.get('Location', '-1')
+        shipTo = data.get('Ships to', '-1')
+        category = data.get('Category', '-1')
+        USD = data.get('Price', '-1').split()[0]
+        left = data.get('Stock', '-1')
+    # image
+    image = soup.find('img', {"class": "bigthumbnail"})
+    image = image.get('src').split('base64,')[-1]
     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
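
The three added image lines are the "image tracking" half of the commit message: AnonMarket apparently inlines the product thumbnail as a base64 data URI, and splitting the src on 'base64,' keeps only the encoded payload. A minimal sketch of that idea (the sample markup is invented for illustration):

    import base64
    from bs4 import BeautifulSoup

    # hypothetical data-URI thumbnail, mirroring the "bigthumbnail" lookup above
    page = '<img class="bigthumbnail" src="data:image/png;base64,aGVsbG8=">'
    soup = BeautifulSoup(page, 'html.parser')

    image = soup.find('img', {"class": "bigthumbnail"})
    payload = image.get('src').split('base64,')[-1]   # 'aGVsbG8='
    print(base64.b64decode(payload))                  # b'hello'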
@@ -111,51 +114,55 @@ def AnonMarket_listing_parser(soup):
     href = [] # 22 Product_Links
     base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion"
+    cat = soup.find("div", {'class': 'heading'}).text
     products_list = soup.find_all('div', {'class': 'item'})
     nm = 0
     for product in products_list:
-        try:
-            name_of_product = product.find("div", {"class": "title"}).text.strip()
-            name.append(name_of_product)
-            name_of_vendor = product.find("a", {'class': 'seller'}).text.strip()
-            vendor.append(name_of_vendor)
-            cat = soup.find("div", {'class': 'heading'}).text
-            category.append(cat)
-            product_link_element = product.find("div", {"class": "title"}).find_parent('a')
-            if product_link_element:
-                link = product_link_element['href']
-                if "/product/" in link and "/user/" not in link:
-                    full_link = base_url + link
-                    href.append(full_link)
-                else:
-                    href.append("-1")
-            else:
-                href.append("-1")
-            # Append '-1' for unavailable data
-            rating_vendor.append("-1")
-            success.append("-1")
-            CVE.append("-1")
-            MS.append("-1")
-            describe.append("-1")
-            views.append("-1")
-            reviews.append("-1")
-            addDate.append("-1")
-            BTC.append("-1")
-            EURO.append("-1")
-            sold.append("-1")
-            qLeft.append("-1")
-            shipFrom.append("-1")
-            shipTo.append("-1")
-            nm += 1
-        except AttributeError as e:
-            print("I'm somewhere I don't belong. I'm going to leave")
-            continue
+        name_of_product = product.find("div", {"class": "title"}).text.strip()
+        name.append(name_of_product)
+        name_of_vendor = product.find("a", {'class': 'seller'}).text.strip()
+        vendor.append(name_of_vendor)
+        category.append(cat)
+        tbody = product.find('div', {"class": "info"}).find('tbody')
+        # rating_item
+        width = tbody.find('div', {"class": "stars2"}).get('style')
+        rating_item.append(cleanNumbers(width.strip()))
+        tr = tbody.findAll('tr', recursive=False)
+        td = tr[2].findAll('td')
+        # sold
+        sold.append(td[0].text.strip())
+        # reviews
+        reviews.append(td[1].text.strip())
+        product_link_element = product.find("div", {"class": "title"}).find_parent('a')
+        link = product_link_element['href']
+        full_link = base_url + link
+        href.append(full_link)
+        # Append '-1' for unavailable data
+        rating_vendor.append("-1")
+        success.append("-1")
+        CVE.append("-1")
+        MS.append("-1")
+        describe.append("-1")
+        views.append("-1")
+        addDate.append("-1")
+        BTC.append("-1")
+        USD.append("-1")
+        EURO.append("-1")
+        qLeft.append("-1")
+        shipFrom.append("-1")
+        shipTo.append("-1")
+        nm += 1
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
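
The rewritten loop drops the try/except and starts scraping real values for rating_item, sold, and reviews instead of '-1'. The rating comes from the CSS width of the 'stars2' div; cleanNumbers is the repo's own helper, so the stand-in and sample markup below are assumptions for illustration only:

    import re
    from bs4 import BeautifulSoup

    def clean_numbers_stub(text):
        # hypothetical stand-in for the repo's cleanNumbers helper:
        # pull the numeric part out of a string like 'width: 84%'
        match = re.search(r"\d+(\.\d+)?", text)
        return match.group() if match else "-1"

    item = BeautifulSoup('<div class="stars2" style="width: 84%"></div>', 'html.parser')
    width = item.find('div', {"class": "stars2"}).get('style')
    print(clean_numbers_stub(width.strip()))   # '84'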


MarketPlaces/DB_Connection/db_connection.py (+2 -2)

@@ -266,7 +266,7 @@ def create_items(cur, row, marketId, vendorId):
         recset = cur.fetchall()
-        #decode_decrypt_image_in_base64(recset[0][20])
+        # decode_decrypt_image_in_base64(recset[0][20])
         if (str(recset[0][4]) != str(row[5] if row[5] != '-1' else None) or str(recset[0][5]) != str(row[6] if row[6] != '-1' else None) or
             str(recset[0][6]) != str(row[7] if row[7] != '-1' else None) or str(recset[0][7]) != str(row[8] if row[8] != '-1' else None) or
@@ -332,7 +332,7 @@ def create_items(cur, row, marketId, vendorId):
             'shippedto_item': row[19] if row[19] != '-1' else None,
             'dateinserted_item': row[23],
             'lastseen_item': row[23],
-            'image_item': row[20],
+            'image_item': row[20] if row[20] != '-1' else None,
             'itemId': itemId})
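
The image_item change applies the same '-1' sentinel convention the surrounding columns already use: the parsers emit '-1' for missing fields, and the guard converts that to None so the database driver stores SQL NULL rather than the literal string. A minimal sketch of the convention:

    def sentinel_to_null(value):
        # '-1' is the scrapers' "no data" marker; None maps to SQL NULL
        return value if value != '-1' else None

    print(sentinel_to_null('abc123'))   # 'abc123'
    print(sentinel_to_null('-1'))       # None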


MarketPlaces/Utilities/utilities.py (+13 -12)

@@ -8,8 +8,6 @@ import base64
 import io
 import configparser
 from datetime import datetime, timedelta
-import datetime as fulldatetime
-from bs4 import BeautifulSoup
 from lxml import html as lxml
 from selenium.webdriver.common.by import By
 from Crypto.Cipher import AES
@@ -246,11 +244,14 @@ def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nom
     rw = []
-    day = time.strftime("%m/%d/%Y")
-    ahora = time.strftime("%I:%M:%S")
+    current_time = datetime.now()
+    day = current_time.strftime("%m/%d/%Y")
     for n in range(nm):
+        current_time += timedelta(seconds=2)
+        ahora = current_time.strftime("%I:%M:%S")
         lne = marketplace # 0
         lne += ","
         lne += vendor[n] # 1
@@ -422,19 +423,19 @@ def cleanHTML(driver, html):
     ]
     # remove images
-    clean_html = re.sub(r"<svg.*?>", "", clean_html)
+    clean_html = re.sub(r"<svg[\s\S]*?svg>", "", clean_html)
     for fmat in formats:
-        clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)
-    clean_html = re.sub(r"<canvas.*?>", "", clean_html)
+        clean_html = re.sub(r"<object.*" + fmat + "[\s\S]*?object>", "", clean_html)
+    clean_html = re.sub(r"<canvas[\s\S]*?canvas>", "", clean_html)
     # remove JavaScript
-    clean_html = re.sub(r"<script.*?>", "", clean_html)
-    clean_html = re.sub(r"<object.*javascript.*?>", "", clean_html)
-    clean_html = re.sub(r"<aplet.*mayscript?>", "", clean_html)
-    clean_html = re.sub(r"<embed.*scriptable?>", "", clean_html)
+    clean_html = re.sub(r"<script[\s\S]*?script>", "", clean_html)
+    clean_html = re.sub(r"<object.*javascript[\s\S]*?object>", "", clean_html)
+    clean_html = re.sub(r"<aplet.*mayscript[\s\S]*?aplet>", "", clean_html)
+    clean_html = re.sub(r"<embed.*scriptable[\s\S]*?embed>", "", clean_html)
     # image and JavaScript
-    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html)
+    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)
     return clean_html
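
The organizeProducts change in this file (and its twin organizeTopics change in Forums/Utilities/utilities.py above) looks like the "change tracking" half of the commit message: the old code stamped every record in a scrape with the same wall-clock time, while the new loop advances a datetime by two seconds per record, so each row gets a distinct, monotonically increasing timestamp. A minimal sketch of the effect, assuming that interpretation:

    from datetime import datetime, timedelta

    nm = 3   # number of records, as in the loop above
    current_time = datetime.now()
    stamps = []
    for n in range(nm):
        current_time += timedelta(seconds=2)   # unique time per record
        stamps.append(current_time.strftime("%I:%M:%S"))

    print(stamps)                    # e.g. ['10:15:02', '10:15:04', '10:15:06']
    assert len(set(stamps)) == nm    # no two records share a timestamp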

