diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py index 659e456..2c2d89f 100644 --- a/Forums/Utilities/utilities.py +++ b/Forums/Utilities/utilities.py @@ -195,12 +195,16 @@ def cleanLink(originalLink): def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate): - day = time.strftime("%m/%d/%Y") - ahora = time.strftime("%I:%M:%S") - rw = [] + current_time = datetime.now() + day = current_time.strftime("%m/%d/%Y") + for n in range(nm): + + current_time += timedelta(seconds=2) + ahora = current_time.strftime("%I:%M:%S") + lne = forum # 0 lne += "," lne += board # 1 @@ -400,19 +404,19 @@ def cleanHTML(driver, html): ] # remove images - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) for fmat in formats: - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) # remove JavaScript - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) # image and JavaScript - clean_html = re.sub(r"]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html) + clean_html = re.sub(r"]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html) return clean_html diff --git a/MarketPlaces/AnonMarket/crawler_selenium.py b/MarketPlaces/AnonMarket/crawler_selenium.py index 42d8e49..e5f5a3d 100644 --- a/MarketPlaces/AnonMarket/crawler_selenium.py +++ b/MarketPlaces/AnonMarket/crawler_selenium.py @@ -159,9 +159,8 @@ def getNameFromURL(url): #as you can see they are categories of products def getInterestedLinks(): links = [] - # # Software - # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/civil_softwares') - # # Malware + + # Malware links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware') # # Bootkits # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits') @@ -195,6 +194,8 @@ def getInterestedLinks(): # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit') # # Security # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security') + # # Ransomware + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/ransomware') return links diff --git a/MarketPlaces/AnonMarket/parser.py b/MarketPlaces/AnonMarket/parser.py index c5c7f6d..997d43e 100644 --- a/MarketPlaces/AnonMarket/parser.py +++ b/MarketPlaces/AnonMarket/parser.py @@ -49,26 +49,29 @@ def AnonMarket_description_parser(soup): info_div = soup.find('div', {'class': 'information'}) table = info_div.find('table') if info_div else None - if table: - # Find all table rows - rows = table.find_all('tr') - - # Parse each row to get relevant data - data = {} - for row in rows: - columns = row.find_all('td') - if len(columns) == 3: - key = columns[0].text.strip() - value = columns[2].text.strip() - data[key] = value - - # Extract specific data from the dictionary and assign them to individual variables - vendor = data.get('Vendor', '-1') - shipFrom = data.get('Location', '-1') - shipTo = data.get('Ships to', '-1') - category = data.get('Category', '-1') - USD = data.get('Price', '-1').split()[0] - left = data.get('Stock', '-1') + # Find all table rows + rows = table.find_all('tr') + + # Parse each row to get relevant data + data = {} + for row in rows: + columns = row.find_all('td') + if len(columns) == 3: + key = columns[0].text.strip() + value = columns[2].text.strip() + data[key] = value + + # Extract specific data from the dictionary and assign them to individual variables + vendor = data.get('Vendor', '-1') + shipFrom = data.get('Location', '-1') + shipTo = data.get('Ships to', '-1') + category = data.get('Category', '-1') + USD = data.get('Price', '-1').split()[0] + left = data.get('Stock', '-1') + + # image + image = soup.find('img', {"class": "bigthumbnail"}) + image = image.get('src').split('base64,')[-1] # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, @@ -111,51 +114,55 @@ def AnonMarket_listing_parser(soup): href = [] # 22 Product_Links base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" + cat = soup.find("div", {'class': 'heading'}).text + products_list = soup.find_all('div', {'class': 'item'}) nm = 0 for product in products_list: - try: - name_of_product = product.find("div", {"class": "title"}).text.strip() - name.append(name_of_product) - - name_of_vendor = product.find("a", {'class': 'seller'}).text.strip() - vendor.append(name_of_vendor) - - cat = soup.find("div", {'class': 'heading'}).text - category.append(cat) - - product_link_element = product.find("div", {"class": "title"}).find_parent('a') - if product_link_element: - link = product_link_element['href'] - if "/product/" in link and "/user/" not in link: - full_link = base_url + link - href.append(full_link) - else: - href.append("-1") - else: - href.append("-1") - - # Append '-1' for unavailable data - rating_vendor.append("-1") - success.append("-1") - CVE.append("-1") - MS.append("-1") - describe.append("-1") - views.append("-1") - reviews.append("-1") - addDate.append("-1") - BTC.append("-1") - EURO.append("-1") - sold.append("-1") - qLeft.append("-1") - shipFrom.append("-1") - shipTo.append("-1") - - nm += 1 - - except AttributeError as e: - print("I'm somewhere I don't belong. I'm going to leave") - continue + name_of_product = product.find("div", {"class": "title"}).text.strip() + name.append(name_of_product) + + name_of_vendor = product.find("a", {'class': 'seller'}).text.strip() + vendor.append(name_of_vendor) + + category.append(cat) + + tbody = product.find('div', {"class": "info"}).find('tbody') + + # rating_item + width = tbody.find('div', {"class": "stars2"}).get('style') + rating_item.append(cleanNumbers(width.strip())) + + tr = tbody.findAll('tr', recursive=False) + td = tr[2].findAll('td') + + # sold + sold.append(td[0].text.strip()) + + # reviews + reviews.append(td[1].text.strip()) + + product_link_element = product.find("div", {"class": "title"}).find_parent('a') + link = product_link_element['href'] + full_link = base_url + link + href.append(full_link) + + # Append '-1' for unavailable data + rating_vendor.append("-1") + success.append("-1") + CVE.append("-1") + MS.append("-1") + describe.append("-1") + views.append("-1") + addDate.append("-1") + BTC.append("-1") + USD.append("-1") + EURO.append("-1") + qLeft.append("-1") + shipFrom.append("-1") + shipTo.append("-1") + + nm += 1 # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index eb4d996..74b1be5 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -266,7 +266,7 @@ def create_items(cur, row, marketId, vendorId): recset = cur.fetchall() - #decode_decrypt_image_in_base64(recset[0][20]) + # decode_decrypt_image_in_base64(recset[0][20]) if (str(recset[0][4]) != str(row[5] if row[5] != '-1' else None) or str(recset[0][5]) != str(row[6] if row[6] != '-1' else None) or str(recset[0][6]) != str(row[7] if row[7] != '-1' else None) or str(recset[0][7]) != str(row[8] if row[8] != '-1' else None) or @@ -332,7 +332,7 @@ def create_items(cur, row, marketId, vendorId): 'shippedto_item': row[19] if row[19] != '-1' else None, 'dateinserted_item': row[23], 'lastseen_item': row[23], - 'image_item': row[20], + 'image_item': row[20] if row[20] != '-1' else None, 'itemId': itemId}) diff --git a/MarketPlaces/Utilities/utilities.py b/MarketPlaces/Utilities/utilities.py index fb9b122..77312f6 100644 --- a/MarketPlaces/Utilities/utilities.py +++ b/MarketPlaces/Utilities/utilities.py @@ -8,8 +8,6 @@ import base64 import io import configparser from datetime import datetime, timedelta -import datetime as fulldatetime -from bs4 import BeautifulSoup from lxml import html as lxml from selenium.webdriver.common.by import By from Crypto.Cipher import AES @@ -246,11 +244,14 @@ def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nom rw = [] - day = time.strftime("%m/%d/%Y") - ahora = time.strftime("%I:%M:%S") + current_time = datetime.now() + day = current_time.strftime("%m/%d/%Y") for n in range(nm): + current_time += timedelta(seconds=2) + ahora = current_time.strftime("%I:%M:%S") + lne = marketplace # 0 lne += "," lne += vendor[n] # 1 @@ -422,19 +423,19 @@ def cleanHTML(driver, html): ] # remove images - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) for fmat in formats: - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) # remove JavaScript - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) # image and JavaScript - clean_html = re.sub(r"]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html) + clean_html = re.sub(r"]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html) return clean_html