
changed href_item/href_topic to the deduplication key and added vendor/author, name/title, and classification to the tracked changes

branch: main
westernmeadow committed 1 year ago
commit 5836e26e37
6 changed files with 183 additions and 165 deletions

  1. Forums/DB_Connection/db_connection.py          +53 -34
  2. Forums/Initialization/prepare_parser.py        +25 -84
  3. Forums/Utilities/utilities.py                  +19 -13
  4. MarketPlaces/DB_Connection/db_connection.py    +40 -29
  5. MarketPlaces/Initialization/prepare_parser.py  +35 -5
  6. MarketPlaces/Utilities/utilities.py            +11 -0

Forums/DB_Connection/db_connection.py (+53 -34)

@@ -45,14 +45,14 @@ def verifyForum(cur, nameForum):
         print (trace)
 
-def verifyTopic(cur, forumId, authorId, titleTopic):
+def verifyTopic(cur, forumId, hrefTopic):
 
     try:
         cur.execute("lock table topics IN ACCESS EXCLUSIVE MODE")
-        cur.execute("select topic_id from topics where forum_id = %(forumId)s and author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1",
-                    {'forumId': forumId, 'authorId': authorId, 'titleTopic': titleTopic})
+        cur.execute("select topic_id from topics where forum_id = %(forumId)s and href_topic = %(hrefTopic)s limit 1",
+                    {'forumId': forumId, 'hrefTopic': hrefTopic})
 
         recset = cur.fetchall()
@@ -260,7 +260,9 @@ def create_forum(cur, row, url):
 def create_topic(cur, forumId, row, authorId):
 
-    topicId = verifyTopic(cur, forumId, authorId, row[3])
+    hrefTopic = get_relative_url(row[6])
+
+    topicId = verifyTopic(cur, forumId, hrefTopic)
 
     if not topicId:
         topicId = int(getLastTopic(cur) + 1)
@@ -275,13 +277,14 @@ def create_topic(cur, forumId, row, authorId):
               "%s, %s, %s, %s, %s)"
 
         recset = [topicId, forumId, authorId,
-                  row[3], row[1],
+                  row[3],
+                  row[1],
                   row[4] if row[4] != '-1' else None,
                   row[5] if row[5] != '-1' else None,
-                  row[6] if row[6] != '-1' else None,
+                  hrefTopic,
                   row[7] if row[7] != '-1' else None,
                   row[8],
-                  row[19]]
+                  row[19] if row[19] != '-1' else None]
 
         cur.execute(sql, recset)
 
     else:
@@ -292,34 +295,50 @@ def create_topic(cur, forumId, row, authorId):
         recset = cur.fetchall()
 
-        if (str(recset[0]['board_topic']) != str(row[1]) or
-            str(recset[0]['views_topic']) != str(row[4] if row[4] != '-1' else None) or    # there was a change in the topic information
-            str(recset[0]['posts_topic']) != str(row[5] if row[5] != '-1' else None)):
+        if row[19] != '-1' and str(recset[0]['classification_topic']) == str(None):
+
+            sql = "Update topics set classification_topic = %(classification_topic)s where topic_id = %(topicId)s"
+            cur.execute(sql, {'classification_topic': row[19],
+                              'topicId': topicId})
+
+        elif (str(recset[0]['author_id']) != str(authorId) or
+              str(recset[0]['title_topic']) != str(row[3]) or
+              str(recset[0]['board_topic']) != str(row[1]) or
+              str(recset[0]['views_topic']) != str(row[4] if row[4] != '-1' else None) or    # there was a change in the topic information
+              str(recset[0]['posts_topic']) != str(row[5] if row[5] != '-1' else None) or
+              str(recset[0]['classification_topic']) != str(row[19] if row[19] != '-1' else recset[0]['classification_topic'])):
 
             topicVersionId = int(getLastTopicVersion(cur, topicId) + 1)
 
             sql = "Insert into topics_history (topic_id, version_topic, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \
                   "href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, " \
                   "%s, %s, %s, %s, %s, %s, %s)"
 
-            recset = [topicId, topicVersionId, forumId, authorId,
-                      recset[0]['title_topic'],
+            recset = [topicId, topicVersionId, forumId,
+                      recset[0]['author_id'],
+                      recset[0]['title_topic'],
                       recset[0]['board_topic'],
                       recset[0]['views_topic'],
                       recset[0]['posts_topic'],
                       recset[0]['href_topic'],
                       recset[0]['dateadded_topic'],
                       recset[0]['dateinserted_topic'],
                       recset[0]['classification_topic']]
 
             cur.execute(sql, recset)
 
-            sql = "Update topics set board_topic = %(board_topic)s, views_topic = %(views_topic)s, posts_topic = %(posts_topic)s, " \
-                  "dateinserted_topic = %(dateinserted_topic)s where topic_id = %(topicId)s"
+            sql = "Update topics set author_id = %(author_id)s, title_topic = %(title_topic)s, board_topic = %(board_topic)s, " \
+                  "views_topic = %(views_topic)s, posts_topic = %(posts_topic)s, dateinserted_topic = %(dateinserted_topic)s, " \
+                  "classification_topic = %(classification_topic)s where topic_id = %(topicId)s"
 
-            cur.execute(sql, {'board_topic': row[1],
-                              'views_topic': row[4] if row[4] != '-1' else None,
-                              'posts_topic': row[5] if row[5] != '-1' else None,
-                              'dateinserted_topic': row[8],
-                              'topicId': topicId})
+            cur.execute(sql, {'author_id': authorId,
+                              'title_topic': row[3] if row[3] != '-1' else None,
+                              'board_topic': row[1] if row[1] != '-1' else None,
+                              'views_topic': row[4] if row[4] != '-1' else None,
+                              'posts_topic': row[5] if row[5] != '-1' else None,
+                              'dateinserted_topic': row[8],
+                              'classification_topic': row[19] if row[19] != '-1' else None,
+                              'topicId': topicId})
 
         return topicId
@@ -548,21 +567,20 @@ def create_database(cur, con):
     sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \
           "title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, " \
           "posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with time zone null, " \
-          "dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision NOT NULL, " \
+          "dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision null, " \
           "constraint topics_pk primary key (topic_id), " \
           "constraint topics_author_id_fk foreign key (author_id) references users (user_id), " \
           "constraint topics_forum_id_fk foreign key (forum_id) references forums (forum_id))"
     cur.execute(sql)
 
-    sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, author_id ASC NULLS LAST, " \
-          "title_topic ASC NULLS LAST)"
+    sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, href_topic ASC NULLS LAST)"
     cur.execute(sql)
 
     sql = "create table topics_history(topic_id integer NOT NULL, version_topic integer not null, forum_id integer NOT NULL, " \
           "author_id integer NOT NULL, title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, " \
           "views_topic integer null, posts_topic integer null, href_topic character varying(255) NOT null, " \
           "dateadded_topic timestamp(6) with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, " \
-          "classification_topic double precision NOT NULL, " \
+          "classification_topic double precision null, " \
           "constraint topics_history_pk primary key (topic_id, version_topic), " \
           "constraint topics_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \
           "constraint topics_history_author_id_f foreign key (author_id) references users (user_id), " \
 
@@ -602,4 +620,5 @@ def create_database(cur, con):
         if (trace.find("already exists")==-1):
             print ("There was a problem during the database creation." )
+            traceback.print_exc()
         raise SystemExit
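
Taken together, the create_topic changes key a topic by where it lives (its href) rather than by who wrote it or what it is called, so renames and author edits are versioned instead of producing duplicate rows. The branch logic reduces to the sketch below; this is a condensed illustration, not code from the commit (the `existing` dict stands in for the fetched topics row, and `row` is the parsed CSV record with the indices used throughout this diff):

# Sketch of the insert / backfill / version decision in the new create_topic.
def decide(existing, authorId, row):
    if existing is None:
        return "insert"                     # unseen (forum_id, href_topic) pair
    if row[19] != '-1' and existing['classification_topic'] is None:
        return "backfill-classification"    # fill a NULL classification in place
    if (str(existing['author_id']) != str(authorId)
            or str(existing['title_topic']) != str(row[3])
            or str(existing['board_topic']) != str(row[1])
            or str(existing['views_topic']) != str(row[4] if row[4] != '-1' else None)
            or str(existing['posts_topic']) != str(row[5] if row[5] != '-1' else None)
            or str(existing['classification_topic']) != str(row[19] if row[19] != '-1'
                                                            else existing['classification_topic'])):
        return "version-then-update"        # old values go to topics_history first
    return "no-change"                      # nothing tracked changed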

Forums/Initialization/prepare_parser.py (+25 -84)

@@ -8,6 +8,8 @@ from psycopg2.extras import RealDictCursor
 from Forums.DB_Connection.db_connection import *
 from Forums.BestCardingWorld.parser import *
+from Forums.CryptBB.parser import *
+from Forums.Incogsnoo.parser import *
 from Forums.Classifier.classify_product import predict
 
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -116,6 +118,10 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
         if forum == "BestCardingWorld":
             rw = bestcardingworld_listing_parser(soup)
+        elif forum == "CryptBB":
+            rw = cryptBB_listing_parser(soup)
+        elif forum == "Incogsnoo":
+            rw = incogsnoo_listing_parser(soup)
         else:
             print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
             raise Exception
@@ -139,6 +145,10 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
         if forum == "BestCardingWorld":
             rmm = bestcardingworld_description_parser(soup)
+        elif forum == "CryptBB":
+            rmm = cryptBB_description_parser(soup)
+        elif forum == "Incogsnoo":
+            rmm = incogsnoo_description_parser(soup)
         else:
             print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
             raise Exception
@@ -224,50 +234,6 @@ def new_parse(forum, url, createLog):
         try:
             logFile = open(mainDir + f"/{CURRENT_DATE}/" + forum + "_" + CURRENT_DATE + ".log", "w")
         except:
-            try:
-                html = open(line2.strip('\n'))
-                soup = BeautifulSoup(html, "html.parser")
-                html.close()
-            except:
-                nError += 1
-                print("There was a problem to read the file " + line2 + " in the Description section!")
-                if createLog:
-                    logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section!\n")
-                continue
-
-            try:
-                if forum == "BestCardingWorld":
-                    rmm = bestcardingworld_description_parser(soup)
-                elif forum == "Cardingleaks":
-                    rmm = cardingleaks_description_parser(soup)
-                elif forum == "CryptBB":
-                    rmm = cryptBB_description_parser(soup)
-                elif forum == "OnniForums":
-                    rmm = onniForums_description_parser(soup)
-                elif forum == "Altenens":
-                    rmm = altenens_description_parser(soup)
-                elif forum == "Procrax":
-                    rmm = procrax_description_parser(soup)
-                elif forum == "Libre":
-                    rmm = libre_description_parser(soup)
-                elif forum == "HiddenAnswers":
-                    rmm = HiddenAnswers_description_parser(soup)
-
-                # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
-                key = u"Url:" + os.path.basename(line2).replace(".html", "")
-
-                # check if "page1" exists at the end of a string
-                # if yes add to first page directory if no add to other
-                check = re.compile(r'page1$')
-                if check.search(key):
-                    # print(key, 'is a first page\n')
-                    detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]}
-                else:
-                    # print(key, 'is an other page\n')
-                    other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
-
             print("Could not open log file!")
             createLog = False
             logFile = None
@@ -277,6 +243,9 @@ def new_parse(forum, url, createLog):
     # Reading the Listing Html Pages
     listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
+    listings.sort(key=os.path.getmtime)
 
     for listingIndex, listingFile in enumerate(listings):
 
         print("Reading listing folder of '" + forum + "', file '" + os.path.basename(listingFile) + "', index= " + str(
@@ -304,44 +273,6 @@ def new_parse(forum, url, createLog):
         if doDescription:
 
-            if not readError:
-
-                parseError = False
-                try:
-                    if forum == "BestCardingWorld":
-                        rw = bestcardingworld_listing_parser(soup)
-                    elif forum == "Cardingleaks":
-                        rw = cardingleaks_listing_parser(soup)
-                    elif forum == "CryptBB":
-                        rw = cryptBB_listing_parser(soup)
-                    elif forum == "OnniForums":
-                        rw = onniForums_listing_parser(soup)
-                    elif forum == "Altenens":
-                        rw = altenens_listing_parser(soup)
-                    elif forum == "Procrax":
-                        rw = procrax_listing_parser(soup)
-                    elif forum == "Libre":
-                        rw = libre_listing_parser(soup)
-                    elif forum == "HiddenAnswers":
-                        rw = HiddenAnswers_listing_parser(soup)
-
-                except:
-                    nError += 1
-                    print("There was a problem to read the file " + line1 + " in the listing section!")
-                    traceback.print_exc()
-                    if createLog:
-                        logFile.write(
-                            str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n")
-                    parseError = True
-
-            if not parseError:
-
-                persistError = False
-                moveError = False
-                num_in_db = 0
-                num_persisted_moved = 0
-
             nFound = 0
 
             for rec in rw:
@@ -353,8 +284,13 @@ def new_parse(forum, url, createLog):
                 # Reading the associated description Html Pages
                 descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))
+                descriptions.sort(key=os.path.getmtime)
 
                 nFound += len(descriptions)
 
+                # Aggregate of posts from multiple description (topic) pages
+                posts = []
+
                 for descriptionIndex, descriptionFile in enumerate(descriptions):
 
                     print("Reading description folder of '" + forum + "', file '" + os.path.basename(
 
@@ -384,8 +320,13 @@ def new_parse(forum, url, createLog):
                     # Combining the information from Listing and Description Pages
                     rec = mergePages(rmm, rec)
 
-                    # Append to the list the classification of the topic
-                    rec.append(str(predict(rec[3], getPosts(rec[15]), language='sup_english')))
+                    # Add the page's posts to aggregate
+                    posts += rec[15]
+
+                    # Classify on final description page
+                    if descriptionIndex == len(descriptions) - 1:
+                        # classification for topic based on all posts from all pages
+                        rec[19] = str(predict(rec[3], getPosts(posts), language='sup_english'))
 
                     # Persisting the information in the database
                     persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile)
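
The change above replaces per-page classification (a prediction appended for each description page) with a single classification over the posts accumulated from every page of the topic, written into rec[19] on the last page. A minimal, self-contained sketch of that pattern, with a stand-in classifier instead of the real predict/getPosts:

# Aggregate posts across all description pages, classify once on the last.
def classify_topic(title, pages, classify):
    posts = []
    for descriptionIndex, page_posts in enumerate(pages):
        posts += page_posts                       # add this page's posts
        if descriptionIndex == len(pages) - 1:    # final page of the topic
            return classify(title, posts)         # one prediction over all posts

# Stand-in classifier; the real call is
# predict(rec[3], getPosts(posts), language='sup_english').
print(classify_topic("example topic",
                     [["post 1", "post 2"], ["post 3"]],
                     lambda title, posts: f"classified over {len(posts)} posts"))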


Forums/Utilities/utilities.py (+19 -13)

@@ -15,6 +15,7 @@ from selenium.webdriver.common.by import By
 from Crypto.Cipher import AES
 from Crypto.Util.Padding import pad, unpad
 from PIL import Image
+from urllib.parse import urlsplit, urljoin
 
 def generate_aes_key():
@@ -205,21 +206,21 @@ def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate,
         lne = forum                                              # 0
         lne += ","
-        lne += board                                             # 1
+        lne += board                                             # 1 board_topic
         lne += ","
         lne += author[n]                                         # 2
         lne += ","
-        lne += topic[n]                                          # 3
+        lne += topic[n]                                          # 3 topic_title
         lne += ","
-        lne += "-1" if len(views) == 0 else views[n]             # 4
+        lne += "-1" if len(views) == 0 else views[n]             # 4 views_topic
         lne += ","
-        lne += "-1" if len(posts) == 0 else posts[n]             # 5
+        lne += "-1" if len(posts) == 0 else posts[n]             # 5 posts_topic
         lne += ","
-        lne += "-1" if len(href) == 0 else href[n]               # 6
+        lne += "-1" if len(href) == 0 else href[n]               # 6 href_topic
         lne += ","
-        lne += "-1" if len(addDate) == 0 else str(addDate[n])    # 7
+        lne += "-1" if len(addDate) == 0 else str(addDate[n])    # 7 dateadded_topic
         lne += ","
-        lne += day + " " + ahora                                 # 8
+        lne += day + " " + ahora                                 # 8 dateinserted_topic
         lne += ","
         lne += "-1" if len(image_author) == 0 else str(image_author[n])    # 9 image_user
         lne += ","
 
@@ -240,6 +241,8 @@ def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate,
         lne += "-1"                                              # 17 dateadded_post
         lne += ","
         lne += "-1"                                              # 18 image_post
+        lne += ","
+        lne += "-1"                                              # 19 classification_post
 
         rw.append(lne)
@@ -427,11 +430,14 @@ def cleanHTML(driver, html):
 
     return clean_html
 
+
+def get_relative_url(target_url):
+    # Use a dummy base URL to handle both absolute and relative URLs
+    base_url = "http://dummybaseurl.com/"
+    absolute_url = urljoin(base_url, target_url)
+    # Parse the absolute URL
+    parsed_absolute_url = urlsplit(absolute_url)
+    # Extract the path and query from the absolute URL as the relative URL
+    return parsed_absolute_url.path + '?' + parsed_absolute_url.query \
+        if parsed_absolute_url.query else parsed_absolute_url.path
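
This helper is what makes href-based deduplication stable: absolute and relative forms of the same link normalize to one path-plus-query string, and the dummy base never leaks into the result. A quick check (hypothetical URLs; assumes get_relative_url as defined above):

# Absolute URL: scheme and host are stripped, query is kept
assert get_relative_url("http://forumxyz.onion/viewtopic.php?t=42") == "/viewtopic.php?t=42"
# Relative URL: resolved against the dummy base, same normal form
assert get_relative_url("viewtopic.php?t=42") == "/viewtopic.php?t=42"
# No query string: just the path
assert get_relative_url("/board/thread-42") == "/board/thread-42"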

MarketPlaces/DB_Connection/db_connection.py (+40 -29)

@@ -67,14 +67,14 @@ def verifyVendor(cur, nameVendor, marketId):
         trace = traceback.format_exc()
         print (trace)
 
-def verifyItem(cur, marketId, vendorId, nameItem):
+def verifyItem(cur, marketId, hrefItem):
 
     try:
         cur.execute("lock table items IN ACCESS EXCLUSIVE MODE")
-        cur.execute("select item_id from items where market_id = %(marketId)s and vendor_id = %(vendorId)s and name_item = %(nameItem)s"
-                    "limit 1", {'marketId': marketId, 'vendorId': vendorId, 'nameItem': nameItem})
+        cur.execute("select item_id from items where market_id = %(marketId)s and href_item = %(hrefItem)s limit 1",
+                    {'marketId': marketId, 'hrefItem': hrefItem})
 
         recset = cur.fetchall()
@@ -263,7 +263,9 @@ def create_vendor(cur, row, marketId):
 def create_items(cur, row, marketId, vendorId):
 
-    itemId = verifyItem(cur, marketId, vendorId, row[4])
+    hrefItem = get_relative_url(row[22])
+
+    itemId = verifyItem(cur, marketId, hrefItem)
 
     if not itemId:
         itemId = int(getLastItem(cur) + 1)
@@ -277,7 +279,7 @@ def create_items(cur, row, marketId, vendorId):
         sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \
               "views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \
-              "quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \
+              "quantityleft_item, shippedfrom_item, shippedto_item, image_item, href_item, lastseen_item, dateinserted_item, " \
               "classification_item) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \
               "%s, %s, %s, %s, %s)"
 
@@ -298,11 +300,11 @@ def create_items(cur, row, marketId, vendorId):
                   row[17] if row[17] != '-1' else None,
                   row[18] if row[18] != '-1' else None,
                   row[19] if row[19] != '-1' else None,
-                  row[23],
                   row[20] if row[20] != '-1' else None,
-                  row[22] if row[22] != '-1' else None,
+                  hrefItem,
+                  row[23],
                   row[23],
-                  row[24]]
+                  row[24] if row[24] != '-1' else None]
 
         cur.execute(sql, recset)
@@ -316,7 +318,9 @@ def create_items(cur, row, marketId, vendorId):
         # decode_decrypt_image_in_base64(recset[0]['image_item'])
 
-        if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or
+        if (str(recset[0]['vendor_id']) != str(vendorId) or
+            str(recset[0]['name_item']) != str(row[4] if row[4] != '-1' else None) or
+            str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or
             str(recset[0]['cve_item']) != str(row[6] if row[6] != '-1' else None) or
             str(recset[0]['ms_item']) != str(row[7] if row[7] != '-1' else None) or
             str(recset[0]['category_item']) != str(row[8] if row[8] != '-1' else None) or
 
@@ -331,7 +335,8 @@ def create_items(cur, row, marketId, vendorId):
             str(recset[0]['quantityleft_item']) != str(row[17] if row[17] != '-1' else None) or
             str(recset[0]['shippedfrom_item']) != str(row[18] if row[18] != '-1' else None) or
             str(recset[0]['shippedto_item']) != str(row[19] if row[19] != '-1' else None) or
-            str(recset[0]['image_item']) != str(row[20] if row[20] != '-1' else None)):
+            str(recset[0]['image_item']) != str(row[20] if row[20] != '-1' else None) or
+            str(recset[0]['classification_item']) != str(row[24] if row[24] != '-1' else None)):
 
             itemVersionId = int(getLastItemVersion(cur, itemId) + 1)
@@ -341,7 +346,8 @@ def create_items(cur, row, marketId, vendorId):
                   "classification_item) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \
                   "%s, %s, %s, %s, %s, %s)"
 
-            recset = [itemId, itemVersionId, marketId, vendorId,
+            recset = [itemId, itemVersionId, marketId,
+                      recset[0]['vendor_id'],
                       recset[0]['name_item'],
                       recset[0]['description_item'],
                       recset[0]['cve_item'],
 
@@ -358,22 +364,26 @@ def create_items(cur, row, marketId, vendorId):
                       recset[0]['quantityleft_item'],
                       recset[0]['shippedfrom_item'],
                       recset[0]['shippedto_item'],
-                      recset[0]['lastseen_item'],
                       recset[0]['image_item'],
                       recset[0]['href_item'],
+                      recset[0]['lastseen_item'],
                       recset[0]['dateinserted_item'],
                       recset[0]['classification_item']]
 
             cur.execute(sql, recset)
 
-            sql = "Update items set description_item = %(description_item)s, cve_item = %(cve_item)s, ms_item = %(ms_item)s, " \
+            sql = "Update items set vendor_id = %(vendor_id)s, name_item = %(name_item)s, " \
+                  "description_item = %(description_item)s, cve_item = %(cve_item)s, ms_item = %(ms_item)s, " \
                   "category_item = %(category_item)s, views_item = %(views_item)s, reviews_item = %(reviews_item)s, " \
                   "rating_item = %(rating_item)s, dateadded_item = %(dateadded_item)s, btc_item = %(btc_item)s, " \
                   "usd_item = %(usd_item)s, euro_item = %(euro_item)s, quantitysold_item = %(quantitysold_item)s, " \
                   "quantityleft_item = %(quantityleft_item)s, shippedfrom_item = %(shippedfrom_item)s, shippedto_item = %(shippedto_item)s, " \
-                  "lastseen_item = %(lastseen_item)s, image_item = %(image_item)s, dateinserted_item = %(dateinserted_item)s where item_id = %(itemId)s"
+                  "image_item = %(image_item)s, lastseen_item = %(lastseen_item)s, dateinserted_item = %(dateinserted_item)s, " \
+                  "classification_item = %(classification_item)s where item_id = %(itemId)s"
 
-            cur.execute(sql, {'description_item': row[5] if row[5] != '-1' else None,
+            cur.execute(sql, {'vendor_id': vendorId,
+                              'name_item': row[4] if row[4] != '-1' else None,
+                              'description_item': row[5] if row[5] != '-1' else None,
                               'cve_item': row[6] if row[6] != '-1' else None,
                               'ms_item': row[7] if row[7] != '-1' else None,
                               'category_item': row[8] if row[8] != '-1' else None,
@@ -388,12 +398,12 @@ def create_items(cur, row, marketId, vendorId):
                               'quantityleft_item': row[17] if row[17] != '-1' else None,
                               'shippedfrom_item': row[18] if row[18] != '-1' else None,
                               'shippedto_item': row[19] if row[19] != '-1' else None,
-                              'dateinserted_item': row[23],
-                              'lastseen_item': row[23],
                               'image_item': row[20] if row[20] != '-1' else None,
+                              'lastseen_item': row[23],
+                              'dateinserted_item': row[23],
+                              'classification_item': row[24] if row[24] != '-1' else None,
                               'itemId': itemId})
 
         else: #updating when was the last time the crawler saw that item
 
             sql = "Update items set lastseen_item = %(lastseen_item)s where item_id = %(itemId)s"
@@ -438,15 +448,15 @@ def create_database(cur, con):
     sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, " \
           "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, " \
-          "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \
-          "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
+          "CONSTRAINT marketplaces_status_pk PRIMARY KEY (market_id, date_inserted), " \
+          "CONSTRAINT marketplaces_status_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
     cur.execute(sql)
 
     sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \
           "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \
           "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \
-          "constraint vendors_pk primary key (vendor_id), constraint vendors_market_id_fkey foreign key (market_id) references marketplaces (" \
-          "market_id))"
+          "constraint vendors_pk primary key (vendor_id), " \
+          "constraint vendors_market_id_fk foreign key (market_id) references marketplaces (market_id))"
     cur.execute(sql)
 
     sql = "create unique index unique_vendor ON vendors USING btree (market_id ASC NULLS LAST, name_vendor ASC NULLS LAST)"
@@ -467,14 +477,14 @@ def create_database(cur, con):
           "character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \
           "null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \
           "character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \
-          "varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(10000000) null, " \
-          "href_item character varying(255) not null, dateinserted_item timestamp(6) with time zone not null, " \
-          "classification_item double precision not null, constraint items_pk primary key (item_id), constraint " \
+          "varying(255) null, image_item character varying(10000000) null, href_item character varying(255) not null, " \
+          "lastseen_item timestamp(6) with time zone not null, dateinserted_item timestamp(6) with time zone not null, " \
+          "classification_item double precision null, constraint items_pk primary key (item_id), constraint " \
           "items_market_id_fkey foreign key (market_id) references marketplaces (market_id),constraint " \
           "items_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id))"
     cur.execute(sql)
 
-    sql = "create unique index unique_item ON items USING btree (market_id ASC NULLS LAST, vendor_id ASC NULLS LAST, name_item ASC NULLS LAST)"
+    sql = "create unique index unique_item ON items USING btree (market_id ASC NULLS LAST, href_item ASC NULLS LAST)"
     cur.execute(sql)
 
     sql = "create table items_history(item_id integer not null, version_item integer not null, market_id integer not null, " \
@@ -484,9 +494,9 @@ def create_database(cur, con):
           "character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \
           "null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \
          "character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \
-          "varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(10000000) null, " \
-          "href_item character varying(255) not null, dateinserted_item timestamp(6) with time zone not null, " \
-          "classification_item double precision not null, constraint items_history_pk primary key (item_id, version_item), " \
+          "varying(255) null, image_item character varying(10000000) null, href_item character varying(255) not null, " \
+          "lastseen_item timestamp(6) with time zone not null, dateinserted_item timestamp(6) with time zone not null, " \
+          "classification_item double precision null, constraint items_history_pk primary key (item_id, version_item), " \
           "constraint items_history_market_id_fkey foreign key (market_id) references marketplaces (market_id), " \
           "constraint items_history_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id), " \
           "constraint items_history_item_id_fkey foreign key (item_id) references items (item_id))"
@@ -502,4 +512,5 @@ def create_database(cur, con):
         if (trace.find("already exists")==-1):
             print ("There was a problem during the database creation." )
+            traceback.print_exc()
         raise SystemExit

MarketPlaces/Initialization/prepare_parser.py (+35 -5)

@@ -10,6 +10,7 @@ from psycopg2.extras import RealDictCursor
 from MarketPlaces.DB_Connection.db_connection import *
 from MarketPlaces.DarkFox.parser import *
 from MarketPlaces.AnonymousMarketplace.parser import *
+from MarketPlaces.TheDarkMarket.parser import *
 from MarketPlaces.ViceCity.parser import *
 from MarketPlaces.M00nkeyMarket.parser import *
 from MarketPlaces.MikesGrandStore.parser import *
@@ -18,7 +19,11 @@ from MarketPlaces.CityMarket.parser import *
 from MarketPlaces.DarkBazar.parser import *
 from MarketPlaces.Sonanza.parser import *
 from MarketPlaces.Kingdom.parser import *
+from MarketPlaces.BlackPyramid.parser import *
+from MarketPlaces.Quest.parser import *
 from MarketPlaces.Ares.parser import *
+from MarketPlaces.CypherMarketplace.parser import *
+from MarketPlaces.WeTheNorth.parser import *
 from MarketPlaces.GoFish.parser import *
 
 from MarketPlaces.Classifier.classify_product import predict
@@ -130,7 +135,7 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         elif marketPlace == "M00nkeyMarket":
             rw = m00nkey_listing_parser(soup)
         elif marketPlace == "MikesGrandStore":
-            rw = mikesGrandStore_listing_parser(soup)
+            rw = MikesGrandStore_listing_parser(soup)
         elif marketPlace == "PabloEscobarMarket":
             rw = pabloescobarmarket_listing_parser(soup)
         elif marketPlace == "CityMarket":
@@ -143,6 +148,16 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
             rw = sonanza_listing_parser(soup)
         elif marketPlace == "Kingdom":
             rw = kingdom_listing_parser(soup)
+        elif marketPlace == "BlackPyramid":
+            rw = blackpyramid_listing_parser(soup)
+        elif marketPlace == "Quest":
+            rw = quest_listing_parser(soup)
+        elif marketPlace == "CypherMarketplace":
+            rw = cyphermarketplace_listing_parser(soup)
+        elif marketPlace == "TheDarkMarket":
+            rw = darkmarket_listing_parser(soup)
+        elif marketPlace == "WeTheNorth":
+            rw = wethenorth_listing_parser(soup)
         elif marketPlace == "GoFish":
             rw = gofish_listing_parser(soup)
         else:
@@ -174,7 +189,7 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         elif marketPlace == "M00nkeyMarket":
             rmm = m00nkey_description_parser(soup)
         elif marketPlace == "MikesGrandStore":
-            rmm = mikesGrandStore_description_parser(soup)
+            rmm = MikesGrandStore_description_parser(soup)
         elif marketPlace == "PabloEscobarMarket":
             rmm = pabloescobarmarket_description_parser(soup)
         elif marketPlace == "CityMarket":
@@ -187,6 +202,16 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
             rmm = sonanza_description_parser(soup)
         elif marketPlace == "Kingdom":
             rmm = kingdom_description_parser(soup)
+        elif marketPlace == "BlackPyramid":
+            rmm = blackpyramid_description_parser(soup)
+        elif marketPlace == "Quest":
+            rmm = quest_description_parser(soup)
+        elif marketPlace == "CypherMarketplace":
+            rmm = cyphermarketplace_description_parser(soup)
+        elif marketPlace == "TheDarkMarket":
+            rmm = darkmarket_description_parser(soup)
+        elif marketPlace == "WeTheNorth":
+            rmm = wethenorth_description_parser(soup)
         elif marketPlace == "GoFish":
             rmm = gofish_description_parser(soup)
         else:
@@ -280,6 +305,9 @@ def new_parse(marketPlace, url, createLog):
     # Reading the Listing Html Pages
     listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
+    listings.sort(key=os.path.getmtime)
 
     for listingIndex, listingFile in enumerate(listings):
 
         print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(listingFile) + "', index= " + str(
@@ -297,12 +325,12 @@ def new_parse(marketPlace, url, createLog):
         moveDescriptionError = False
         findDescriptionError = False
 
         rw = []
 
         if doParseListing:
 
             rw = parse_listing(marketPlace, listingFile, listingSoup, createLog, logFile)
 
             doDescription = rw is not None
 
         if doDescription:
@@ -312,12 +340,14 @@ def new_parse(marketPlace, url, createLog):
             for rec in rw:
 
                 rec = rec.split(',')
 
                 descriptionPattern = cleanLink(rec[22]) + ".html"
 
                 # Reading the associated description Html Pages
                 descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))
+                descriptions.sort(key=os.path.getmtime)
 
                 nFound += len(descriptions)
 
                 for descriptionIndex, descriptionFile in enumerate(descriptions):


MarketPlaces/Utilities/utilities.py (+11 -0)

@@ -13,6 +13,7 @@ from selenium.webdriver.common.by import By
 from Crypto.Cipher import AES
 from Crypto.Util.Padding import pad, unpad
 from PIL import Image
+from urllib.parse import urlsplit, urljoin
 
 def generate_aes_key():
@@ -439,4 +440,14 @@ def cleanHTML(driver, html):
 
     return clean_html
 
+
+def get_relative_url(target_url):
+    # Use a dummy base URL to handle both absolute and relative URLs
+    base_url = "http://dummybaseurl.com/"
+    absolute_url = urljoin(base_url, target_url)
+    # Parse the absolute URL
+    parsed_absolute_url = urlsplit(absolute_url)
+    # Extract the path and query from the absolute URL as the relative URL
+    return parsed_absolute_url.path + '?' + parsed_absolute_url.query \
+        if parsed_absolute_url.query else parsed_absolute_url.path
