
changed href_item/topic to primary key and added vendor/author, name/title, and classification to tracked changes

main
westernmeadow 1 year ago
parent
commit
5836e26e37
6 changed files with 183 additions and 165 deletions
  1. Forums/DB_Connection/db_connection.py (+53, -34)
  2. Forums/Initialization/prepare_parser.py (+25, -84)
  3. Forums/Utilities/utilities.py (+19, -13)
  4. MarketPlaces/DB_Connection/db_connection.py (+40, -29)
  5. MarketPlaces/Initialization/prepare_parser.py (+35, -5)
  6. MarketPlaces/Utilities/utilities.py (+11, -0)

Forums/DB_Connection/db_connection.py (+53, -34)

@@ -45,14 +45,14 @@ def verifyForum(cur, nameForum):
print (trace)
def verifyTopic(cur, forumId, authorId, titleTopic):
def verifyTopic(cur, forumId, hrefTopic):
try:
cur.execute("lock table topics IN ACCESS EXCLUSIVE MODE")
cur.execute("select topic_id from topics where forum_id = %(forumId)s and author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1",
{'forumId': forumId, 'authorId': authorId, 'titleTopic': titleTopic})
cur.execute("select topic_id from topics where forum_id = %(forumId)s and href_topic = %(hrefTopic)s limit 1",
{'forumId': forumId, 'hrefTopic': hrefTopic})
recset = cur.fetchall()
@@ -260,7 +260,9 @@ def create_forum(cur, row, url):
def create_topic(cur, forumId, row, authorId):
topicId = verifyTopic(cur, forumId, authorId, row[3])
hrefTopic = get_relative_url(row[6])
topicId = verifyTopic(cur, forumId, hrefTopic)
if not topicId:
topicId = int(getLastTopic(cur) + 1)
@@ -275,13 +277,14 @@ def create_topic(cur, forumId, row, authorId):
"%s, %s, %s, %s, %s)"
recset = [topicId, forumId, authorId,
row[3], row[1],
row[3],
row[1],
row[4] if row[4] != '-1' else None,
row[5] if row[5] != '-1' else None,
row[6] if row[6] != '-1' else None,
hrefTopic,
row[7] if row[7] != '-1' else None,
row[8],
row[19]]
row[19] if row[19] != '-1' else None]
cur.execute(sql, recset)
else:
@@ -292,34 +295,50 @@ def create_topic(cur, forumId, row, authorId):
recset = cur.fetchall()
if (str(recset[0]['board_topic']) != str(row[1]) or
str(recset[0]['views_topic']) != str(row[4] if row[4] != '-1' else None) or # there was a change in the topic information
str(recset[0]['posts_topic']) != str(row[5] if row[5] != '-1' else None)):
if row[19] != '-1' and str(recset[0]['classification_topic']) == str(None):
sql = "Update topics set classification_topic = %(classification_topic)s where topic_id = %(topicId)s"
cur.execute(sql, {'classification_topic': row[19],
'topicId': topicId})
elif (str(recset[0]['author_id']) != str(authorId) or
str(recset[0]['title_topic']) != str(row[3]) or
str(recset[0]['board_topic']) != str(row[1]) or
str(recset[0]['views_topic']) != str(row[4] if row[4] != '-1' else None) or # there was a change in the topic information
str(recset[0]['posts_topic']) != str(row[5] if row[5] != '-1' else None) or
str(recset[0]['classification_topic']) != str(row[19] if row[19] != '-1' else recset[0]['classification_topic'])):
topicVersionId = int(getLastTopicVersion(cur, topicId) + 1)
topicVersionId = int(getLastTopicVersion(cur, topicId) + 1)
sql = "Insert into topics_history (topic_id, version_topic, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \
sql = "Insert into topics_history (topic_id, version_topic, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \
"href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, " \
"%s, %s, %s, %s, %s, %s, %s)"
recset = [topicId, topicVersionId, forumId, authorId,
recset[0]['title_topic'],
recset[0]['board_topic'],
recset[0]['views_topic'],
recset[0]['posts_topic'],
recset[0]['href_topic'],
recset[0]['dateadded_topic'],
recset[0]['dateinserted_topic'],
recset[0]['classification_topic']]
cur.execute(sql, recset)
sql = "Update topics set board_topic = %(board_topic)s, views_topic = %(views_topic)s, posts_topic = %(posts_topic)s, " \
"dateinserted_topic = %(dateinserted_topic)s where topic_id = %(topicId)s"
cur.execute(sql, {'board_topic': row[1],
'views_topic': row[4] if row[4] != '-1' else None,
'posts_topic': row[5] if row[5] != '-1' else None,
'dateinserted_topic': row[8],
'topicId': topicId})
recset = [topicId, topicVersionId, forumId,
recset[0]['author_id'],
recset[0]['title_topic'],
recset[0]['board_topic'],
recset[0]['views_topic'],
recset[0]['posts_topic'],
recset[0]['href_topic'],
recset[0]['dateadded_topic'],
recset[0]['dateinserted_topic'],
recset[0]['classification_topic']]
cur.execute(sql, recset)
sql = "Update topics set author_id = %(author_id)s, title_topic = %(title_topic)s, board_topic = %(board_topic)s, " \
"views_topic = %(views_topic)s, posts_topic = %(posts_topic)s, dateinserted_topic = %(dateinserted_topic)s, " \
"classification_topic = %(classification_topic)s where topic_id = %(topicId)s"
cur.execute(sql, {'author_id': authorId,
'title_topic': row[3] if row[3] != '-1' else None,
'board_topic': row[1] if row[1] != '-1' else None,
'views_topic': row[4] if row[4] != '-1' else None,
'posts_topic': row[5] if row[5] != '-1' else None,
'dateinserted_topic': row[8],
'classification_topic': row[19] if row[19] != '-1' else None,
'topicId': topicId})
return topicId
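
Aside: the hunk above splits the "topic already exists" path in two. Below is a minimal sketch of the new first branch, assuming a psycopg2 RealDictCursor and the organizeTopics record layout (index 19 carries the classification); the function name is hypothetical and only illustrates the committed logic:

def backfill_topic_classification(cur, topicId, classification):
    # Sketch only: if the stored topic predates classification (NULL in the DB)
    # and the parser now supplies one ('-1' means "unknown"), fill it in place
    # without writing a topics_history version.
    cur.execute("select classification_topic from topics where topic_id = %(topicId)s limit 1",
                {'topicId': topicId})
    rows = cur.fetchall()
    if rows and classification != '-1' and rows[0]['classification_topic'] is None:
        cur.execute("Update topics set classification_topic = %(classification_topic)s where topic_id = %(topicId)s",
                    {'classification_topic': classification, 'topicId': topicId})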
@@ -548,21 +567,20 @@ def create_database(cur, con):
sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \
"title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, " \
"posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with time zone null, " \
"dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision NOT NULL, " \
"dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision null, " \
"constraint topics_pk primary key (topic_id), " \
"constraint topics_author_id_fk foreign key (author_id) references users (user_id), " \
"constraint topics_forum_id_fk foreign key (forum_id) references forums (forum_id))"
cur.execute(sql)
sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, author_id ASC NULLS LAST, " \
"title_topic ASC NULLS LAST)"
sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, href_topic ASC NULLS LAST)"
cur.execute(sql)
sql = "create table topics_history(topic_id integer NOT NULL, version_topic integer not null, forum_id integer NOT NULL, " \
"author_id integer NOT NULL, title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, " \
"views_topic integer null, posts_topic integer null, href_topic character varying(255) NOT null, " \
"dateadded_topic timestamp(6) with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, " \
"classification_topic double precision NOT NULL, " \
"classification_topic double precision null, " \
"constraint topics_history_pk primary key (topic_id, version_topic), " \
"constraint topics_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \
"constraint topics_history_author_id_f foreign key (author_id) references users (user_id), " \
@@ -602,4 +620,5 @@ def create_database(cur, con):
if (trace.find("already exists")==-1):
print ("There was a problem during the database creation." )
traceback.print_exc()
raise SystemExit

Forums/Initialization/prepare_parser.py (+25, -84)

@@ -8,6 +8,8 @@ from psycopg2.extras import RealDictCursor
from Forums.DB_Connection.db_connection import *
from Forums.BestCardingWorld.parser import *
from Forums.CryptBB.parser import *
from Forums.Incogsnoo.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -116,6 +118,10 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
if forum == "BestCardingWorld":
rw = bestcardingworld_listing_parser(soup)
elif forum == "CryptBB":
rw = cryptBB_listing_parser(soup)
elif forum == "Incogsnoo":
rw = incogsnoo_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -139,6 +145,10 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
if forum == "BestCardingWorld":
rmm = bestcardingworld_description_parser(soup)
elif forum == "CryptBB":
rmm = cryptBB_description_parser(soup)
elif forum == "Incogsnoo":
rmm = incogsnoo_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -224,50 +234,6 @@ def new_parse(forum, url, createLog):
try:
logFile = open(mainDir + f"/{CURRENT_DATE}/" + forum + "_" + CURRENT_DATE + ".log", "w")
except:
try:
html = open(line2.strip('\n'))
soup = BeautifulSoup(html, "html.parser")
html.close()
except:
nError += 1
print("There was a problem to read the file " + line2 + " in the Description section!")
if createLog:
logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section!\n")
continue
try:
if forum == "BestCardingWorld":
rmm = bestcardingworld_description_parser(soup)
elif forum == "Cardingleaks":
rmm = cardingleaks_description_parser(soup)
elif forum == "CryptBB":
rmm = cryptBB_description_parser(soup)
elif forum == "OnniForums":
rmm = onniForums_description_parser(soup)
elif forum == "Altenens":
rmm = altenens_description_parser(soup)
elif forum == "Procrax":
rmm = procrax_description_parser(soup)
elif forum == "Libre":
rmm = libre_description_parser(soup)
elif forum == "HiddenAnswers":
rmm = HiddenAnswers_description_parser(soup)
# key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
key = u"Url:" + os.path.basename(line2).replace(".html", "")
# check if "page1" exists at the end of a string
# if yes add to first page directory if no add to other
check = re.compile(r'page1$')
if check.search(key):
# print(key, 'is a first page\n')
detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]}
else:
# print(key, 'is an other page\n')
other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
print("Could not open log file!")
createLog = False
logFile = None
@@ -277,6 +243,9 @@ def new_parse(forum, url, createLog):
# Reading the Listing Html Pages
listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
listings.sort(key=os.path.getmtime)
for listingIndex, listingFile in enumerate(listings):
print("Reading listing folder of '" + forum + "', file '" + os.path.basename(listingFile) + "', index= " + str(
@@ -304,44 +273,6 @@ def new_parse(forum, url, createLog):
if doDescription:
if not readError:
parseError = False
try:
if forum == "BestCardingWorld":
rw = bestcardingworld_listing_parser(soup)
elif forum == "Cardingleaks":
rw = cardingleaks_listing_parser(soup)
elif forum == "CryptBB":
rw = cryptBB_listing_parser(soup)
elif forum == "OnniForums":
rw = onniForums_listing_parser(soup)
elif forum == "Altenens":
rw = altenens_listing_parser(soup)
elif forum == "Procrax":
rw = procrax_listing_parser(soup)
elif forum == "Libre":
rw = libre_listing_parser(soup)
elif forum == "HiddenAnswers":
rw = HiddenAnswers_listing_parser(soup)
except:
nError += 1
print("There was a problem to read the file " + line1 + " in the listing section!")
traceback.print_exc()
if createLog:
logFile.write(
str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n")
parseError = True
if not parseError:
persistError = False
moveError = False
num_in_db = 0
num_persisted_moved = 0
nFound = 0
for rec in rw:
@@ -353,8 +284,13 @@ def new_parse(forum, url, createLog):
# Reading the associated description Html Pages
descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))
descriptions.sort(key=os.path.getmtime)
nFound += len(descriptions)
# Aggregate of posts from multiple description (topic) pages
posts = []
for descriptionIndex, descriptionFile in enumerate(descriptions):
print("Reading description folder of '" + forum + "', file '" + os.path.basename(
@@ -384,8 +320,13 @@ def new_parse(forum, url, createLog):
# Combining the information from Listing and Description Pages
rec = mergePages(rmm, rec)
# Append to the list the classification of the topic
rec.append(str(predict(rec[3], getPosts(rec[15]), language='sup_english')))
# Add the page's posts to aggregate
posts += rec[15]
# Classify on final description page
if descriptionIndex == len(descriptions) - 1:
# classification for topic based on all posts from all pages
rec[19] = str(predict(rec[3], getPosts(posts), language='sup_english'))
# Persisting the information in the database
persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile)
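
Aside: the two hunks above change classification from per-page to per-topic. Posts from every paginated description page are accumulated, and predict() runs once, on the final page, over the full set. A self-contained sketch of that pattern follows (the dict layout and the toy classifier are made up; in the committed code the posts sit at rec[15]/rmm and the result goes to rec[19] via predict(rec[3], getPosts(posts), language='sup_english')):

def classify_topic_from_pages(title, description_pages, classify):
    # description_pages: one parsed record per description page of a single topic
    # classify: stand-in for the project's predict()
    posts = []
    classification = '-1'
    for index, page in enumerate(description_pages):
        posts += page['posts']                        # aggregate posts across pages
        if index == len(description_pages) - 1:       # classify only on the final page
            classification = str(classify(title, posts))
    return classification

pages = [{'posts': ['post 1', 'post 2']}, {'posts': ['post 3']}]
print(classify_topic_from_pages('example topic', pages, lambda t, p: len(p)))   # -> 3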


Forums/Utilities/utilities.py (+19, -13)

@@ -15,6 +15,7 @@ from selenium.webdriver.common.by import By
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from PIL import Image
from urllib.parse import urlsplit, urljoin
def generate_aes_key():
@@ -205,21 +206,21 @@ def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate,
lne = forum # 0
lne += ","
lne += board # 1
lne += board # 1 board_topic
lne += ","
lne += author[n] # 2
lne += ","
lne += topic[n] # 3
lne += topic[n] # 3 topic_title
lne += ","
lne += "-1" if len(views) == 0 else views[n] # 4
lne += "-1" if len(views) == 0 else views[n] # 4 views_topic
lne += ","
lne += "-1" if len(posts) == 0 else posts[n] # 5
lne += "-1" if len(posts) == 0 else posts[n] # 5 posts_topic
lne += ","
lne += "-1" if len(href) == 0 else href[n] # 6
lne += "-1" if len(href) == 0 else href[n] # 6 href_topic
lne += ","
lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7
lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7 dateadded_topic
lne += ","
lne += day + " " + ahora # 8
lne += day + " " + ahora # 8 dateinserted_topic
lne += ","
lne += "-1" if len(image_author) == 0 else str(image_author[n]) # 9 image_user
lne += ","
@@ -240,6 +241,8 @@ def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate,
lne += "-1" # 17 dateadded_post
lne += ","
lne += "-1" # 18 image_post
lne += ","
lne += "-1" # 19 classification_post
rw.append(lne)
@@ -427,11 +430,14 @@ def cleanHTML(driver, html):
return clean_html
def get_relative_url(target_url):
# Use a dummy base URL to handle both absolute and relative URLs
base_url = "http://dummybaseurl.com/"
absolute_url = urljoin(base_url, target_url)
# Parse the absolute URL
parsed_absolute_url = urlsplit(absolute_url)
# Extract the path and query from the absolute URL as the relative URL
return parsed_absolute_url.path + '?' + parsed_absolute_url.query \
if parsed_absolute_url.query else parsed_absolute_url.path
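
A quick usage check for the helper above (the URLs are invented; the behaviour follows directly from urljoin/urlsplit). Absolute and relative hrefs normalize to the same relative path-plus-query, which is what the new href_topic/href_item identity keys rely on:

print(get_relative_url("http://forumexample.onion/viewtopic.php?t=123"))   # /viewtopic.php?t=123
print(get_relative_url("viewtopic.php?t=123"))                             # /viewtopic.php?t=123
print(get_relative_url("/viewtopic.php?t=123&start=25"))                   # /viewtopic.php?t=123&start=25
print(get_relative_url("index.php"))                                       # /index.php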

MarketPlaces/DB_Connection/db_connection.py (+40, -29)

@@ -67,14 +67,14 @@ def verifyVendor(cur, nameVendor, marketId):
trace = traceback.format_exc()
print (trace)
def verifyItem(cur, marketId, vendorId, nameItem):
def verifyItem(cur, marketId, hrefItem):
try:
cur.execute("lock table items IN ACCESS EXCLUSIVE MODE")
cur.execute("select item_id from items where market_id = %(marketId)s and vendor_id = %(vendorId)s and name_item = %(nameItem)s"
"limit 1", {'marketId': marketId, 'vendorId': vendorId, 'nameItem': nameItem})
cur.execute("select item_id from items where market_id = %(marketId)s and href_item = %(hrefItem)s limit 1",
{'marketId': marketId, 'hrefItem': hrefItem})
recset = cur.fetchall()
@@ -263,7 +263,9 @@ def create_vendor(cur, row, marketId):
def create_items(cur, row, marketId, vendorId):
itemId = verifyItem(cur, marketId, vendorId, row[4])
hrefItem = get_relative_url(row[22])
itemId = verifyItem(cur, marketId, hrefItem)
if not itemId:
itemId = int(getLastItem(cur) + 1)
@@ -277,7 +279,7 @@ def create_items(cur, row, marketId, vendorId):
sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \
"views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \
"quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \
"quantityleft_item, shippedfrom_item, shippedto_item, image_item, href_item, lastseen_item, dateinserted_item, " \
"classification_item) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \
"%s, %s, %s, %s, %s)"
@@ -298,11 +300,11 @@ def create_items(cur, row, marketId, vendorId):
row[17] if row[17] != '-1' else None,
row[18] if row[18] != '-1' else None,
row[19] if row[19] != '-1' else None,
row[23],
row[20] if row[20] != '-1' else None,
row[22] if row[22] != '-1' else None,
hrefItem,
row[23],
row[23],
row[24]]
row[24] if row[24] != '-1' else None]
cur.execute(sql, recset)
@@ -316,7 +318,9 @@ def create_items(cur, row, marketId, vendorId):
# decode_decrypt_image_in_base64(recset[0]['image_item'])
if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or
if (str(recset[0]['vendor_id']) != str(vendorId) or
str(recset[0]['name_item']) != str(row[4] if row[4] != '-1' else None) or
str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or
str(recset[0]['cve_item']) != str(row[6] if row[6] != '-1' else None) or
str(recset[0]['ms_item']) != str(row[7] if row[7] != '-1' else None) or
str(recset[0]['category_item']) != str(row[8] if row[8] != '-1' else None) or
@@ -331,7 +335,8 @@ def create_items(cur, row, marketId, vendorId):
str(recset[0]['quantityleft_item']) != str(row[17] if row[17] != '-1' else None) or
str(recset[0]['shippedfrom_item']) != str(row[18] if row[18] != '-1' else None) or
str(recset[0]['shippedto_item']) != str(row[19] if row[19] != '-1' else None) or
str(recset[0]['image_item']) != str(row[20] if row[20] != '-1' else None)):
str(recset[0]['image_item']) != str(row[20] if row[20] != '-1' else None) or
str(recset[0]['classification_item']) != str(row[24] if row[24] != '-1' else None)):
itemVersionId = int(getLastItemVersion(cur, itemId) + 1)
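
Aside: the comparison above now also versions an item when vendor_id, name_item, or classification_item change, not just the description/price/shipping fields. A condensed, hypothetical sketch of the idiom (the committed code compares every tracked column inline; '-1' from the parsers maps to NULL in the database):

def norm(value):
    # mirrors the "row[i] if row[i] != '-1' else None" idiom used throughout
    return None if value == '-1' else value

def item_changed(current, vendorId, row):
    # current: the existing items row from a RealDictCursor; row: the new CSV record
    tracked = [
        (current['vendor_id'], vendorId),                   # newly tracked by this commit
        (current['name_item'], norm(row[4])),               # newly tracked by this commit
        (current['description_item'], norm(row[5])),
        (current['image_item'], norm(row[20])),
        (current['classification_item'], norm(row[24])),    # newly tracked by this commit
    ]
    return any(str(old) != str(new) for old, new in tracked)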
@@ -341,7 +346,8 @@ def create_items(cur, row, marketId, vendorId):
"classification_item) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \
"%s, %s, %s, %s, %s, %s)"
recset = [itemId, itemVersionId, marketId, vendorId,
recset = [itemId, itemVersionId, marketId,
recset[0]['vendor_id'],
recset[0]['name_item'],
recset[0]['description_item'],
recset[0]['cve_item'],
@@ -358,22 +364,26 @@ def create_items(cur, row, marketId, vendorId):
recset[0]['quantityleft_item'],
recset[0]['shippedfrom_item'],
recset[0]['shippedto_item'],
recset[0]['lastseen_item'],
recset[0]['image_item'],
recset[0]['href_item'],
recset[0]['lastseen_item'],
recset[0]['dateinserted_item'],
recset[0]['classification_item']]
cur.execute(sql, recset)
sql = "Update items set description_item = %(description_item)s, cve_item = %(cve_item)s, ms_item = %(ms_item)s, " \
sql = "Update items set vendor_id = %(vendor_id)s, name_item = %(name_item)s, " \
"description_item = %(description_item)s, cve_item = %(cve_item)s, ms_item = %(ms_item)s, " \
"category_item = %(category_item)s, views_item = %(views_item)s, reviews_item = %(reviews_item)s, " \
"rating_item = %(rating_item)s, dateadded_item = %(dateadded_item)s, btc_item = %(btc_item)s, " \
"usd_item = %(usd_item)s, euro_item = %(euro_item)s, quantitysold_item = %(quantitysold_item)s, " \
"quantityleft_item = %(quantityleft_item)s, shippedfrom_item = %(shippedfrom_item)s, shippedto_item = %(shippedto_item)s, " \
"lastseen_item = %(lastseen_item)s, image_item = %(image_item)s, dateinserted_item = %(dateinserted_item)s where item_id = %(itemId)s"
"image_item = %(image_item)s, lastseen_item = %(lastseen_item)s, dateinserted_item = %(dateinserted_item)s." \
"classification_item = %(classification_item)s where item_id = %(itemId)s"
cur.execute(sql, {'description_item': row[5] if row[5] != '-1' else None,
cur.execute(sql, {'vendor_id': vendorId,
'name_item': row[4] if row[4] != '-1' else None,
'description_item': row[5] if row[5] != '-1' else None,
'cve_item': row[6] if row[6] != '-1' else None,
'ms_item': row[7] if row[7] != '-1' else None,
'category_item': row[8] if row[8] != '-1' else None,
@@ -388,12 +398,12 @@ def create_items(cur, row, marketId, vendorId):
'quantityleft_item': row[17] if row[17] != '-1' else None,
'shippedfrom_item': row[18] if row[18] != '-1' else None,
'shippedto_item': row[19] if row[19] != '-1' else None,
'dateinserted_item': row[23],
'lastseen_item': row[23],
'image_item': row[20] if row[20] != '-1' else None,
'lastseen_item': row[23],
'dateinserted_item': row[23],
'classification_item': row[24] if row[24] != '-1' else None,
'itemId': itemId})
else: #updating when was the last time the crawler saw that item
sql = "Update items set lastseen_item = %(lastseen_item)s where item_id = %(itemId)s"
@@ -438,15 +448,15 @@ def create_database(cur, con):
sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, " \
"listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, " \
"CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \
"CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
"CONSTRAINT marketplaces_status_pk PRIMARY KEY (market_id, date_inserted), " \
"CONSTRAINT marketplaces_status_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
cur.execute(sql)
sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \
"varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \
"null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \
"constraint vendors_pk primary key (vendor_id), constraint vendors_market_id_fkey foreign key (market_id) references marketplaces (" \
"market_id))"
"constraint vendors_pk primary key (vendor_id), " \
"constraint vendors_market_id_fk foreign key (market_id) references marketplaces (market_id))"
cur.execute(sql)
sql = "create unique index unique_vendor ON vendors USING btree (market_id ASC NULLS LAST, name_vendor ASC NULLS LAST)"
@@ -467,14 +477,14 @@ def create_database(cur, con):
"character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \
"null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \
"character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \
"varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(10000000) null, " \
"href_item character varying(255) not null, dateinserted_item timestamp(6) with time zone not null, " \
"classification_item double precision not null, constraint items_pk primary key (item_id), constraint " \
"varying(255) null, image_item character varying(10000000) null, href_item character varying(255) not null, " \
"lastseen_item timestamp(6) with time zone not null, dateinserted_item timestamp(6) with time zone not null, " \
"classification_item double precision null, constraint items_pk primary key (item_id), constraint " \
"items_market_id_fkey foreign key (market_id) references marketplaces (market_id),constraint " \
"items_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id))"
cur.execute(sql)
sql = "create unique index unique_item ON items USING btree (market_id ASC NULLS LAST, vendor_id ASC NULLS LAST, name_item ASC NULLS LAST)"
sql = "create unique index unique_item ON items USING btree (market_id ASC NULLS LAST, href_item ASC NULLS LAST)"
cur.execute(sql)
sql = "create table items_history(item_id integer not null, version_item integer not null, market_id integer not null, " \
@@ -484,9 +494,9 @@ def create_database(cur, con):
"character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \
"null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \
"character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \
"varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(10000000) null, " \
"href_item character varying(255) not null, dateinserted_item timestamp(6) with time zone not null, " \
"classification_item double precision not null, constraint items_history_pk primary key (item_id, version_item), " \
"varying(255) null, image_item character varying(10000000) null, href_item character varying(255) not null, " \
"lastseen_item timestamp(6) with time zone not null, dateinserted_item timestamp(6) with time zone not null, " \
"classification_item double precision null, constraint items_history_pk primary key (item_id, version_item), " \
"constraint items_history_market_id_fkey foreign key (market_id) references marketplaces (market_id), " \
"constraint items_history_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id), " \
"constraint items_history_item_id_fkey foreign key (item_id) references items (item_id))"
@@ -502,4 +512,5 @@ def create_database(cur, con):
if (trace.find("already exists")==-1):
print ("There was a problem during the database creation." )
traceback.print_exc()
raise SystemExit

MarketPlaces/Initialization/prepare_parser.py (+35, -5)

@@ -10,6 +10,7 @@ from psycopg2.extras import RealDictCursor
from MarketPlaces.DB_Connection.db_connection import *
from MarketPlaces.DarkFox.parser import *
from MarketPlaces.AnonymousMarketplace.parser import *
from MarketPlaces.TheDarkMarket.parser import *
from MarketPlaces.ViceCity.parser import *
from MarketPlaces.M00nkeyMarket.parser import *
from MarketPlaces.MikesGrandStore.parser import *
@@ -18,7 +19,11 @@ from MarketPlaces.CityMarket.parser import *
from MarketPlaces.DarkBazar.parser import *
from MarketPlaces.Sonanza.parser import *
from MarketPlaces.Kingdom.parser import *
from MarketPlaces.BlackPyramid.parser import *
from MarketPlaces.Quest.parser import *
from MarketPlaces.Ares.parser import *
from MarketPlaces.CypherMarketplace.parser import *
from MarketPlaces.WeTheNorth.parser import *
from MarketPlaces.GoFish.parser import *
from MarketPlaces.Classifier.classify_product import predict
@@ -130,7 +135,7 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
elif marketPlace == "M00nkeyMarket":
rw = m00nkey_listing_parser(soup)
elif marketPlace == "MikesGrandStore":
rw = mikesGrandStore_listing_parser(soup)
rw = MikesGrandStore_listing_parser(soup)
elif marketPlace == "PabloEscobarMarket":
rw = pabloescobarmarket_listing_parser(soup)
elif marketPlace == "CityMarket":
@@ -143,6 +148,16 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
rw = sonanza_listing_parser(soup)
elif marketPlace == "Kingdom":
rw = kingdom_listing_parser(soup)
elif marketPlace == "BlackPyramid":
rw = blackpyramid_listing_parser(soup)
elif marketPlace == "Quest":
rw = quest_listing_parser(soup)
elif marketPlace == "CypherMarketplace":
rw = cyphermarketplace_listing_parser(soup)
elif marketPlace == "TheDarkMarket":
rw = darkmarket_listing_parser(soup)
elif marketPlace == "WeTheNorth":
rw = wethenorth_listing_parser(soup)
elif marketPlace == "GoFish":
rw = gofish_listing_parser(soup)
else:
@@ -174,7 +189,7 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
elif marketPlace == "M00nkeyMarket":
rmm = m00nkey_description_parser(soup)
elif marketPlace == "MikesGrandStore":
rmm = mikesGrandStore_description_parser(soup)
rmm = MikesGrandStore_description_parser(soup)
elif marketPlace == "PabloEscobarMarket":
rmm = pabloescobarmarket_description_parser(soup)
elif marketPlace == "CityMarket":
@@ -187,6 +202,16 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
rmm = sonanza_description_parser(soup)
elif marketPlace == "Kingdom":
rmm = kingdom_description_parser(soup)
elif marketPlace == "BlackPyramid":
rmm = blackpyramid_description_parser(soup)
elif marketPlace == "Quest":
rmm = quest_description_parser(soup)
elif marketPlace == "CypherMarketplace":
rmm = cyphermarketplace_description_parser(soup)
elif marketPlace == "TheDarkMarket":
rmm = darkmarket_description_parser(soup)
elif marketPlace == "WeTheNorth":
rmm = wethenorth_description_parser(soup)
elif marketPlace == "GoFish":
rmm = gofish_description_parser(soup)
else:
@@ -280,6 +305,9 @@ def new_parse(marketPlace, url, createLog):
# Reading the Listing Html Pages
listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
listings.sort(key=os.path.getmtime)
for listingIndex, listingFile in enumerate(listings):
print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(listingFile) + "', index= " + str(
@@ -297,12 +325,12 @@ def new_parse(marketPlace, url, createLog):
moveDescriptionError = False
findDescriptionError = False
rw = []
rw = []
if doParseListing:
rw = parse_listing(marketPlace, listingFile, listingSoup, createLog, logFile)
doDescription = rw is not None
if doDescription:
@@ -312,12 +340,14 @@ def new_parse(marketPlace, url, createLog):
for rec in rw:
rec = rec.split(',')
descriptionPattern = cleanLink(rec[22]) + ".html"
# Reading the associated description Html Pages
descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))
descriptions.sort(key=os.path.getmtime)
nFound += len(descriptions)
for descriptionIndex, descriptionFile in enumerate(descriptions):


MarketPlaces/Utilities/utilities.py (+11, -0)

@@ -13,6 +13,7 @@ from selenium.webdriver.common.by import By
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from PIL import Image
from urllib.parse import urlsplit, urljoin
def generate_aes_key():
@@ -439,4 +440,14 @@ def cleanHTML(driver, html):
return clean_html
def get_relative_url(target_url):
# Use a dummy base URL to handle both absolute and relative URLs
base_url = "http://dummybaseurl.com/"
absolute_url = urljoin(base_url, target_url)
# Parse the absolute URL
parsed_absolute_url = urlsplit(absolute_url)
# Extract the path and query from the absolute URL as the relative URL
return parsed_absolute_url.path + '?' + parsed_absolute_url.query \
if parsed_absolute_url.query else parsed_absolute_url.path
