diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index 6cc9c60..33259e9 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -45,14 +45,14 @@ def verifyForum(cur, nameForum): print (trace) -def verifyTopic(cur, forumId, authorId, titleTopic): +def verifyTopic(cur, forumId, hrefTopic): try: cur.execute("lock table topics IN ACCESS EXCLUSIVE MODE") - cur.execute("select topic_id from topics where forum_id = %(forumId)s and author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1", - {'forumId': forumId, 'authorId': authorId, 'titleTopic': titleTopic}) + cur.execute("select topic_id from topics where forum_id = %(forumId)s and href_topic = %(hrefTopic)s limit 1", + {'forumId': forumId, 'hrefTopic': hrefTopic}) recset = cur.fetchall() @@ -260,7 +260,9 @@ def create_forum(cur, row, url): def create_topic(cur, forumId, row, authorId): - topicId = verifyTopic(cur, forumId, authorId, row[3]) + hrefTopic = get_relative_url(row[6]) + + topicId = verifyTopic(cur, forumId, hrefTopic) if not topicId: topicId = int(getLastTopic(cur) + 1) @@ -275,13 +277,14 @@ def create_topic(cur, forumId, row, authorId): "%s, %s, %s, %s, %s)" recset = [topicId, forumId, authorId, - row[3], row[1], + row[3], + row[1], row[4] if row[4] != '-1' else None, row[5] if row[5] != '-1' else None, - row[6] if row[6] != '-1' else None, + hrefTopic, row[7] if row[7] != '-1' else None, row[8], - row[19]] + row[19] if row[19] != '-1' else None] cur.execute(sql, recset) else: @@ -292,34 +295,50 @@ def create_topic(cur, forumId, row, authorId): recset = cur.fetchall() - if (str(recset[0]['board_topic']) != str(row[1]) or - str(recset[0]['views_topic']) != str(row[4] if row[4] != '-1' else None) or # there was a change in the topic information - str(recset[0]['posts_topic']) != str(row[5] if row[5] != '-1' else None)): + if row[19] != '-1' and str(recset[0]['classification_topic']) == str(None): + + sql = "Update topics set classification_topic = %(classification_topic)s where topic_id = %(topicId)s" + cur.execute(sql, {'classification_topic': row[19], + 'topicId': topicId}) + + elif (str(recset[0]['author_id']) != str(authorId) or + str(recset[0]['title_topic']) != str(row[3]) or + str(recset[0]['board_topic']) != str(row[1]) or + str(recset[0]['views_topic']) != str(row[4] if row[4] != '-1' else None) or # there was a change in the topic information + str(recset[0]['posts_topic']) != str(row[5] if row[5] != '-1' else None) or + str(recset[0]['classification_topic']) != str(row[19] if row[19] != '-1' else recset[0]['classification_topic'])): - topicVersionId = int(getLastTopicVersion(cur, topicId) + 1) + topicVersionId = int(getLastTopicVersion(cur, topicId) + 1) - sql = "Insert into topics_history (topic_id, version_topic, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \ + sql = "Insert into topics_history (topic_id, version_topic, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \ "href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, " \ "%s, %s, %s, %s, %s, %s, %s)" - recset = [topicId, topicVersionId, forumId, authorId, - recset[0]['title_topic'], - recset[0]['board_topic'], - recset[0]['views_topic'], - recset[0]['posts_topic'], - recset[0]['href_topic'], - recset[0]['dateadded_topic'], - recset[0]['dateinserted_topic'], - recset[0]['classification_topic']] - cur.execute(sql, recset) - - sql = "Update topics set board_topic = 
%(board_topic)s, views_topic = %(views_topic)s, posts_topic = %(posts_topic)s, " \ - "dateinserted_topic = %(dateinserted_topic)s where topic_id = %(topicId)s" - cur.execute(sql, {'board_topic': row[1], - 'views_topic': row[4] if row[4] != '-1' else None, - 'posts_topic': row[5] if row[5] != '-1' else None, - 'dateinserted_topic': row[8], - 'topicId': topicId}) + recset = [topicId, topicVersionId, forumId, + recset[0]['author_id'], + recset[0]['title_topic'], + recset[0]['board_topic'], + recset[0]['views_topic'], + recset[0]['posts_topic'], + recset[0]['href_topic'], + recset[0]['dateadded_topic'], + recset[0]['dateinserted_topic'], + recset[0]['classification_topic']] + + cur.execute(sql, recset) + + sql = "Update topics set author_id = %(author_id)s, title_topic = %(title_topic)s, board_topic = %(board_topic)s, " \ + "views_topic = %(views_topic)s, posts_topic = %(posts_topic)s, dateinserted_topic = %(dateinserted_topic)s, " \ + "classification_topic = %(classification_topic)s where topic_id = %(topicId)s" + + cur.execute(sql, {'author_id': authorId, + 'title_topic': row[3] if row[3] != '-1' else None, + 'board_topic': row[1] if row[1] != '-1' else None, + 'views_topic': row[4] if row[4] != '-1' else None, + 'posts_topic': row[5] if row[5] != '-1' else None, + 'dateinserted_topic': row[8], + 'classification_topic': row[19] if row[19] != '-1' else None, + 'topicId': topicId}) return topicId @@ -548,21 +567,20 @@ def create_database(cur, con): sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \ "title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, " \ "posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with time zone null, " \ - "dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision NOT NULL, " \ + "dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision null, " \ "constraint topics_pk primary key (topic_id), " \ "constraint topics_author_id_fk foreign key (author_id) references users (user_id), " \ "constraint topics_forum_id_fk foreign key (forum_id) references forums (forum_id))" cur.execute(sql) - sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, author_id ASC NULLS LAST, " \ - "title_topic ASC NULLS LAST)" + sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, href_topic ASC NULLS LAST)" cur.execute(sql) sql = "create table topics_history(topic_id integer NOT NULL, version_topic integer not null, forum_id integer NOT NULL, " \ "author_id integer NOT NULL, title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, " \ "views_topic integer null, posts_topic integer null, href_topic character varying(255) NOT null, " \ "dateadded_topic timestamp(6) with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, " \ - "classification_topic double precision NOT NULL, " \ + "classification_topic double precision null, " \ "constraint topics_history_pk primary key (topic_id, version_topic), " \ "constraint topics_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \ "constraint topics_history_author_id_f foreign key (author_id) references users (user_id), " \ @@ -602,4 +620,5 @@ def create_database(cur, con): if (trace.find("already exists")==-1): print ("There was a problem during the database creation." 
) + traceback.print_exc() raise SystemExit \ No newline at end of file diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index f6c0499..ca582c2 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -8,6 +8,8 @@ from psycopg2.extras import RealDictCursor from Forums.DB_Connection.db_connection import * from Forums.BestCardingWorld.parser import * +from Forums.CryptBB.parser import * +from Forums.Incogsnoo.parser import * from Forums.Classifier.classify_product import predict # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi @@ -116,6 +118,10 @@ def parse_listing(forum, listingFile, soup, createLog, logFile): if forum == "BestCardingWorld": rw = bestcardingworld_listing_parser(soup) + elif forum == "CryptBB": + rw = cryptBB_listing_parser(soup) + elif forum == "Incogsnoo": + rw = incogsnoo_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -139,6 +145,10 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile): if forum == "BestCardingWorld": rmm = bestcardingworld_description_parser(soup) + elif forum == "CryptBB": + rmm = cryptBB_description_parser(soup) + elif forum == "Incogsnoo": + rmm = incogsnoo_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -224,50 +234,6 @@ def new_parse(forum, url, createLog): try: logFile = open(mainDir + f"/{CURRENT_DATE}/" + forum + "_" + CURRENT_DATE + ".log", "w") except: - - try: - html = open(line2.strip('\n')) - soup = BeautifulSoup(html, "html.parser") - html.close() - except: - - nError += 1 - print("There was a problem to read the file " + line2 + " in the Description section!") - if createLog: - logFile.write(str(nError) + ". 
There was a problem to read the file " + line2 + " in the Description section!\n") - continue - - try: - - if forum == "BestCardingWorld": - rmm = bestcardingworld_description_parser(soup) - elif forum == "Cardingleaks": - rmm = cardingleaks_description_parser(soup) - elif forum == "CryptBB": - rmm = cryptBB_description_parser(soup) - elif forum == "OnniForums": - rmm = onniForums_description_parser(soup) - elif forum == "Altenens": - rmm = altenens_description_parser(soup) - elif forum == "Procrax": - rmm = procrax_description_parser(soup) - elif forum == "Libre": - rmm = libre_description_parser(soup) - elif forum == "HiddenAnswers": - rmm = HiddenAnswers_description_parser(soup) - - # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip() - key = u"Url:" + os.path.basename(line2).replace(".html", "") - - # check if "page1" exists at the end of a string - # if yes add to first page directory if no add to other - check = re.compile(r'page1$') - if check.search(key): - # print(key, 'is a first page\n') - detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]} - else: - # print(key, 'is an other page\n') - other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)} print("Could not open log file!") createLog = False logFile = None @@ -277,6 +243,9 @@ def new_parse(forum, url, createLog): # Reading the Listing Html Pages listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')) + + listings.sort(key=os.path.getmtime) + for listingIndex, listingFile in enumerate(listings): print("Reading listing folder of '" + forum + "', file '" + os.path.basename(listingFile) + "', index= " + str( @@ -304,44 +273,6 @@ def new_parse(forum, url, createLog): if doDescription: - if not readError: - - parseError = False - try: - - if forum == "BestCardingWorld": - rw = bestcardingworld_listing_parser(soup) - elif forum == "Cardingleaks": - rw = cardingleaks_listing_parser(soup) - elif forum == "CryptBB": - rw = cryptBB_listing_parser(soup) - elif forum == "OnniForums": - rw = onniForums_listing_parser(soup) - elif forum == "Altenens": - rw = altenens_listing_parser(soup) - elif forum == "Procrax": - rw = procrax_listing_parser(soup) - elif forum == "Libre": - rw = libre_listing_parser(soup) - elif forum == "HiddenAnswers": - rw = HiddenAnswers_listing_parser(soup) - - except: - - nError += 1 - print("There was a problem to read the file " + line1 + " in the listing section!") - traceback.print_exc() - if createLog: - logFile.write( - str(nError) + ". 
There was a problem to read the file " + line1 + " in the Listing section.\n") - parseError = True - - if not parseError: - - persistError = False - moveError = False - num_in_db = 0 - num_persisted_moved = 0 nFound = 0 for rec in rw: @@ -353,8 +284,13 @@ def new_parse(forum, url, createLog): # Reading the associated description Html Pages descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern)) + descriptions.sort(key=os.path.getmtime) + nFound += len(descriptions) + # Aggregate of posts from multiple description (topic) pages + posts = [] + for descriptionIndex, descriptionFile in enumerate(descriptions): print("Reading description folder of '" + forum + "', file '" + os.path.basename( @@ -384,8 +320,13 @@ def new_parse(forum, url, createLog): # Combining the information from Listing and Description Pages rec = mergePages(rmm, rec) - # Append to the list the classification of the topic - rec.append(str(predict(rec[3], getPosts(rec[15]), language='sup_english'))) + # Add the page's posts to aggregate + posts += rec[15] + + # Classify on final description page + if descriptionIndex == len(descriptions) - 1: + # classification for topic based on all posts from all pages + rec[19] = str(predict(rec[3], getPosts(posts), language='sup_english')) # Persisting the information in the database persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile) diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py index 2a5e2f0..a8f34ea 100644 --- a/Forums/Utilities/utilities.py +++ b/Forums/Utilities/utilities.py @@ -15,6 +15,7 @@ from selenium.webdriver.common.by import By from Crypto.Cipher import AES from Crypto.Util.Padding import pad, unpad from PIL import Image +from urllib.parse import urlsplit, urljoin def generate_aes_key(): @@ -205,21 +206,21 @@ def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, lne = forum # 0 lne += "," - lne += board # 1 + lne += board # 1 board_topic lne += "," lne += author[n] # 2 lne += "," - lne += topic[n] # 3 + lne += topic[n] # 3 topic_title lne += "," - lne += "-1" if len(views) == 0 else views[n] # 4 + lne += "-1" if len(views) == 0 else views[n] # 4 views_topic lne += "," - lne += "-1" if len(posts) == 0 else posts[n] # 5 + lne += "-1" if len(posts) == 0 else posts[n] # 5 posts_topic lne += "," - lne += "-1" if len(href) == 0 else href[n] # 6 + lne += "-1" if len(href) == 0 else href[n] # 6 href_topic lne += "," - lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7 + lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7 dateadded_topic lne += "," - lne += day + " " + ahora # 8 + lne += day + " " + ahora # 8 dateinserted_topic lne += "," lne += "-1" if len(image_author) == 0 else str(image_author[n]) # 9 image_user lne += "," @@ -240,6 +241,8 @@ def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, lne += "-1" # 17 dateadded_post lne += "," lne += "-1" # 18 image_post + lne += "," + lne += "-1" # 19 classification_post rw.append(lne) @@ -427,11 +430,14 @@ def cleanHTML(driver, html): return clean_html +def get_relative_url(target_url): + # Use a dummy base URL to handle both absolute and relative URLs + base_url = "http://dummybaseurl.com/" + absolute_url = urljoin(base_url, target_url) + # Parse the absolute URL + parsed_absolute_url = urlsplit(absolute_url) - - - - - - + # Extract the path and query from the absolute URL as the relative URL + return parsed_absolute_url.path + '?' 
+ parsed_absolute_url.query \ + if parsed_absolute_url.query else parsed_absolute_url.path diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 7d85bb5..403d59a 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -67,14 +67,14 @@ def verifyVendor(cur, nameVendor, marketId): trace = traceback.format_exc() print (trace) -def verifyItem(cur, marketId, vendorId, nameItem): +def verifyItem(cur, marketId, hrefItem): try: cur.execute("lock table items IN ACCESS EXCLUSIVE MODE") - cur.execute("select item_id from items where market_id = %(marketId)s and vendor_id = %(vendorId)s and name_item = %(nameItem)s" - "limit 1", {'marketId': marketId, 'vendorId': vendorId, 'nameItem': nameItem}) + cur.execute("select item_id from items where market_id = %(marketId)s and href_item = %(hrefItem)s limit 1", + {'marketId': marketId, 'hrefItem': hrefItem}) recset = cur.fetchall() @@ -263,7 +263,9 @@ def create_vendor(cur, row, marketId): def create_items(cur, row, marketId, vendorId): - itemId = verifyItem(cur, marketId, vendorId, row[4]) + hrefItem = get_relative_url(row[22]) + + itemId = verifyItem(cur, marketId, hrefItem) if not itemId: itemId = int(getLastItem(cur) + 1) @@ -277,7 +279,7 @@ def create_items(cur, row, marketId, vendorId): sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \ "views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \ - "quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \ + "quantityleft_item, shippedfrom_item, shippedto_item, image_item, href_item, lastseen_item, dateinserted_item, " \ "classification_item) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \ "%s, %s, %s, %s, %s)" @@ -298,11 +300,11 @@ def create_items(cur, row, marketId, vendorId): row[17] if row[17] != '-1' else None, row[18] if row[18] != '-1' else None, row[19] if row[19] != '-1' else None, - row[23], row[20] if row[20] != '-1' else None, - row[22] if row[22] != '-1' else None, + hrefItem, + row[23], row[23], - row[24]] + row[24] if row[24] != '-1' else None] cur.execute(sql, recset) @@ -316,7 +318,9 @@ def create_items(cur, row, marketId, vendorId): # decode_decrypt_image_in_base64(recset[0]['image_item']) - if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or + if (str(recset[0]['vendor_id']) != str(vendorId) or + str(recset[0]['name_item']) != str(row[4] if row[4] != '-1' else None) or + str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or str(recset[0]['cve_item']) != str(row[6] if row[6] != '-1' else None) or str(recset[0]['ms_item']) != str(row[7] if row[7] != '-1' else None) or str(recset[0]['category_item']) != str(row[8] if row[8] != '-1' else None) or @@ -331,7 +335,8 @@ def create_items(cur, row, marketId, vendorId): str(recset[0]['quantityleft_item']) != str(row[17] if row[17] != '-1' else None) or str(recset[0]['shippedfrom_item']) != str(row[18] if row[18] != '-1' else None) or str(recset[0]['shippedto_item']) != str(row[19] if row[19] != '-1' else None) or - str(recset[0]['image_item']) != str(row[20] if row[20] != '-1' else None)): + str(recset[0]['image_item']) != str(row[20] if row[20] != '-1' else None) or + str(recset[0]['classification_item']) != str(row[24] if row[24] != '-1' else None)): 
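The hunks above move duplicate detection for items from (market_id, vendor_id, name_item) to (market_id, href_item), mirroring the earlier verifyTopic change, so the same get_relative_url normalization has to be applied both when a record is stored and when it is looked up. A minimal sketch (not part of the patch) of that lookup pattern follows, assuming psycopg2 with RealDictCursor and the items schema created later in this diff; the helper name find_item_by_href and the connection string are placeholders for illustration.

    import psycopg2
    from psycopg2.extras import RealDictCursor

    from MarketPlaces.Utilities.utilities import get_relative_url


    def find_item_by_href(cur, market_id, raw_href):
        # Normalize the crawled link the same way create_items does before storing it,
        # so absolute and relative forms of the same URL resolve to the same row.
        href_item = get_relative_url(raw_href)
        cur.execute("select item_id from items where market_id = %(marketId)s "
                    "and href_item = %(hrefItem)s limit 1",
                    {'marketId': market_id, 'hrefItem': href_item})
        recset = cur.fetchall()
        return recset[0]['item_id'] if recset else None


    if __name__ == '__main__':
        # Placeholder DSN; the real connection settings live in the project's config.
        con = psycopg2.connect("dbname=darkweb user=postgres")
        cur = con.cursor(cursor_factory=RealDictCursor)
        print(find_item_by_href(cur, 1, "http://somemarket.onion/item.php?id=42"))

The unique index on (market_id, href_item) created further down enforces the same rule inside the database, so a duplicate that slips past this check fails loudly instead of silently creating a second row.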
itemVersionId = int(getLastItemVersion(cur, itemId) + 1) @@ -341,7 +346,8 @@ def create_items(cur, row, marketId, vendorId): "classification_item) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \ "%s, %s, %s, %s, %s)" - recset = [itemId, itemVersionId, marketId, vendorId, + recset = [itemId, itemVersionId, marketId, + recset[0]['vendor_id'], recset[0]['name_item'], recset[0]['description_item'], recset[0]['cve_item'], @@ -358,22 +364,26 @@ def create_items(cur, row, marketId, vendorId): recset[0]['quantityleft_item'], recset[0]['shippedfrom_item'], recset[0]['shippedto_item'], - recset[0]['lastseen_item'], recset[0]['image_item'], recset[0]['href_item'], + recset[0]['lastseen_item'], recset[0]['dateinserted_item'], recset[0]['classification_item']] cur.execute(sql, recset) - sql = "Update items set description_item = %(description_item)s, cve_item = %(cve_item)s, ms_item = %(ms_item)s, " \ + sql = "Update items set vendor_id = %(vendor_id)s, name_item = %(name_item)s, " \ + "description_item = %(description_item)s, cve_item = %(cve_item)s, ms_item = %(ms_item)s, " \ "category_item = %(category_item)s, views_item = %(views_item)s, reviews_item = %(reviews_item)s, " \ "rating_item = %(rating_item)s, dateadded_item = %(dateadded_item)s, btc_item = %(btc_item)s, " \ "usd_item = %(usd_item)s, euro_item = %(euro_item)s, quantitysold_item = %(quantitysold_item)s, " \ "quantityleft_item = %(quantityleft_item)s, shippedfrom_item = %(shippedfrom_item)s, shippedto_item = %(shippedto_item)s, " \ - "lastseen_item = %(lastseen_item)s, image_item = %(image_item)s, dateinserted_item = %(dateinserted_item)s where item_id = %(itemId)s" + "image_item = %(image_item)s, lastseen_item = %(lastseen_item)s, dateinserted_item = %(dateinserted_item)s, " 
\ + "classification_item = %(classification_item)s where item_id = %(itemId)s" - cur.execute(sql, {'description_item': row[5] if row[5] != '-1' else None, + cur.execute(sql, {'vendor_id': vendorId, + 'name_item': row[4] if row[4] != '-1' else None, + 'description_item': row[5] if row[5] != '-1' else None, 'cve_item': row[6] if row[6] != '-1' else None, 'ms_item': row[7] if row[7] != '-1' else None, 'category_item': row[8] if row[8] != '-1' else None, @@ -388,12 +398,12 @@ def create_items(cur, row, marketId, vendorId): 'quantityleft_item': row[17] if row[17] != '-1' else None, 'shippedfrom_item': row[18] if row[18] != '-1' else None, 'shippedto_item': row[19] if row[19] != '-1' else None, - 'dateinserted_item': row[23], - 'lastseen_item': row[23], 'image_item': row[20] if row[20] != '-1' else None, + 'lastseen_item': row[23], + 'dateinserted_item': row[23], + 'classification_item': row[24] if row[24] != '-1' else None, 'itemId': itemId}) - else: #updating when was the last time the crawler saw that item sql = "Update items set lastseen_item = %(lastseen_item)s where item_id = %(itemId)s" @@ -438,15 +448,15 @@ def create_database(cur, con): sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, " \ "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, " \ - "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \ - "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))" + "CONSTRAINT marketplaces_status_pk PRIMARY KEY (market_id, date_inserted), " \ + "CONSTRAINT marketplaces_status_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))" cur.execute(sql) sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \ "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \ "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ - "constraint vendors_pk primary key (vendor_id), constraint vendors_market_id_fkey foreign key (market_id) references marketplaces (" \ - "market_id))" + "constraint vendors_pk primary key (vendor_id), " \ + "constraint vendors_market_id_fk foreign key (market_id) references marketplaces (market_id))" cur.execute(sql) sql = "create unique index unique_vendor ON vendors USING btree (market_id ASC NULLS LAST, name_vendor ASC NULLS LAST)" @@ -467,14 +477,14 @@ def create_database(cur, con): "character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \ "null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \ "character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \ - "varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(10000000) null, " \ - "href_item character varying(255) not null, dateinserted_item timestamp(6) with time zone not null, " \ - "classification_item double precision not null, constraint items_pk primary key (item_id), constraint " \ + "varying(255) null, image_item character varying(10000000) null, href_item character varying(255) not null, " \ + "lastseen_item timestamp(6) with time zone not null, dateinserted_item timestamp(6) with time zone not null, " \ + "classification_item double precision null, constraint items_pk primary key (item_id), constraint " \ 
"items_market_id_fkey foreign key (market_id) references marketplaces (market_id),constraint " \ "items_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id))" cur.execute(sql) - sql = "create unique index unique_item ON items USING btree (market_id ASC NULLS LAST, vendor_id ASC NULLS LAST, name_item ASC NULLS LAST)" + sql = "create unique index unique_item ON items USING btree (market_id ASC NULLS LAST, href_item ASC NULLS LAST)" cur.execute(sql) sql = "create table items_history(item_id integer not null, version_item integer not null, market_id integer not null, " \ @@ -484,9 +494,9 @@ def create_database(cur, con): "character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \ "null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \ "character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \ - "varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(10000000) null, " \ - "href_item character varying(255) not null, dateinserted_item timestamp(6) with time zone not null, " \ - "classification_item double precision not null, constraint items_history_pk primary key (item_id, version_item), " \ + "varying(255) null, image_item character varying(10000000) null, href_item character varying(255) not null, " \ + "lastseen_item timestamp(6) with time zone not null, dateinserted_item timestamp(6) with time zone not null, " \ + "classification_item double precision null, constraint items_history_pk primary key (item_id, version_item), " \ "constraint items_history_market_id_fkey foreign key (market_id) references marketplaces (market_id), " \ "constraint items_history_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id), " \ "constraint items_history_item_id_fkey foreign key (item_id) references items (item_id))" @@ -502,4 +512,5 @@ def create_database(cur, con): if (trace.find("already exists")==-1): print ("There was a problem during the database creation." 
) + traceback.print_exc() raise SystemExit \ No newline at end of file diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 4e3ab4a..1901f76 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -10,6 +10,7 @@ from psycopg2.extras import RealDictCursor from MarketPlaces.DB_Connection.db_connection import * from MarketPlaces.DarkFox.parser import * from MarketPlaces.AnonymousMarketplace.parser import * +from MarketPlaces.TheDarkMarket.parser import * from MarketPlaces.ViceCity.parser import * from MarketPlaces.M00nkeyMarket.parser import * from MarketPlaces.MikesGrandStore.parser import * @@ -18,7 +19,11 @@ from MarketPlaces.CityMarket.parser import * from MarketPlaces.DarkBazar.parser import * from MarketPlaces.Sonanza.parser import * from MarketPlaces.Kingdom.parser import * +from MarketPlaces.BlackPyramid.parser import * +from MarketPlaces.Quest.parser import * from MarketPlaces.Ares.parser import * +from MarketPlaces.CypherMarketplace.parser import * +from MarketPlaces.WeTheNorth.parser import * from MarketPlaces.GoFish.parser import * from MarketPlaces.Classifier.classify_product import predict @@ -130,7 +135,7 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): elif marketPlace == "M00nkeyMarket": rw = m00nkey_listing_parser(soup) elif marketPlace == "MikesGrandStore": - rw = mikesGrandStore_listing_parser(soup) + rw = MikesGrandStore_listing_parser(soup) elif marketPlace == "PabloEscobarMarket": rw = pabloescobarmarket_listing_parser(soup) elif marketPlace == "CityMarket": @@ -143,6 +148,16 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = sonanza_listing_parser(soup) elif marketPlace == "Kingdom": rw = kingdom_listing_parser(soup) + elif marketPlace == "BlackPyramid": + rw = blackpyramid_listing_parser(soup) + elif marketPlace == "Quest": + rw = quest_listing_parser(soup) + elif marketPlace == "CypherMarketplace": + rw = cyphermarketplace_listing_parser(soup) + elif marketPlace == "TheDarkMarket": + rw = darkmarket_listing_parser(soup) + elif marketPlace == "WeTheNorth": + rw = wethenorth_listing_parser(soup) elif marketPlace == "GoFish": rw = gofish_listing_parser(soup) else: @@ -174,7 +189,7 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): elif marketPlace == "M00nkeyMarket": rmm = m00nkey_description_parser(soup) elif marketPlace == "MikesGrandStore": - rmm = mikesGrandStore_description_parser(soup) + rmm = MikesGrandStore_description_parser(soup) elif marketPlace == "PabloEscobarMarket": rmm = pabloescobarmarket_description_parser(soup) elif marketPlace == "CityMarket": @@ -187,6 +202,16 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): rmm = sonanza_description_parser(soup) elif marketPlace == "Kingdom": rmm = kingdom_description_parser(soup) + elif marketPlace == "BlackPyramid": + rmm = blackpyramid_description_parser(soup) + elif marketPlace == "Quest": + rmm = quest_description_parser(soup) + elif marketPlace == "CypherMarketplace": + rmm = cyphermarketplace_description_parser(soup) + elif marketPlace == "TheDarkMarket": + rmm = darkmarket_description_parser(soup) + elif marketPlace == "WeTheNorth": + rmm = wethenorth_description_parser(soup) elif marketPlace == "GoFish": rmm = gofish_description_parser(soup) else: @@ -280,6 +305,9 @@ def new_parse(marketPlace, url, createLog): # Reading the Listing Html Pages listings = 
glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')) + + listings.sort(key=os.path.getmtime) + for listingIndex, listingFile in enumerate(listings): print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(listingFile) + "', index= " + str( @@ -297,12 +325,12 @@ def new_parse(marketPlace, url, createLog): moveDescriptionError = False findDescriptionError = False - rw = [] + rw = [] if doParseListing: rw = parse_listing(marketPlace, listingFile, listingSoup, createLog, logFile) - + doDescription = rw is not None if doDescription: @@ -312,12 +340,14 @@ def new_parse(marketPlace, url, createLog): for rec in rw: rec = rec.split(',') - + descriptionPattern = cleanLink(rec[22]) + ".html" # Reading the associated description Html Pages descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern)) + descriptions.sort(key=os.path.getmtime) + nFound += len(descriptions) for descriptionIndex, descriptionFile in enumerate(descriptions): diff --git a/MarketPlaces/Utilities/utilities.py b/MarketPlaces/Utilities/utilities.py index c6aa192..9a04d1d 100644 --- a/MarketPlaces/Utilities/utilities.py +++ b/MarketPlaces/Utilities/utilities.py @@ -13,6 +13,7 @@ from selenium.webdriver.common.by import By from Crypto.Cipher import AES from Crypto.Util.Padding import pad, unpad from PIL import Image +from urllib.parse import urlsplit, urljoin def generate_aes_key(): @@ -439,4 +440,14 @@ def cleanHTML(driver, html): return clean_html +def get_relative_url(target_url): + # Use a dummy base URL to handle both absolute and relative URLs + base_url = "http://dummybaseurl.com/" + absolute_url = urljoin(base_url, target_url) + # Parse the absolute URL + parsed_absolute_url = urlsplit(absolute_url) + + # Extract the path and query from the absolute URL as the relative URL + return parsed_absolute_url.path + '?' + parsed_absolute_url.query \ + if parsed_absolute_url.query else parsed_absolute_url.path
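Since both verifyTopic and verifyItem now key on the value returned by get_relative_url, a short self-contained illustration of its behaviour may help review: the helper reduces any crawled link, absolute or relative, to its path plus query string, dropping scheme, host and fragment. The example URLs below are made up, and the function body is repeated from the patch so the snippet runs on its own.

    from urllib.parse import urlsplit, urljoin


    def get_relative_url(target_url):
        # Same logic as the helper added in this patch, duplicated so the example
        # is runnable in isolation.
        base_url = "http://dummybaseurl.com/"
        absolute_url = urljoin(base_url, target_url)
        parsed_absolute_url = urlsplit(absolute_url)
        return parsed_absolute_url.path + '?' + parsed_absolute_url.query \
            if parsed_absolute_url.query else parsed_absolute_url.path


    print(get_relative_url("http://somemarket.onion/viewtopic.php?t=123&page=2"))  # /viewtopic.php?t=123&page=2
    print(get_relative_url("viewtopic.php?t=123"))                                 # /viewtopic.php?t=123
    print(get_relative_url("/forum/board/thread-5.html"))                          # /forum/board/thread-5.html

One caveat worth keeping in mind: links that differ only in query-parameter order or in volatile parameters such as session tokens still normalize to different keys, so the per-forum and per-market parsers are expected to hand over stable hrefs.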