diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index 33259e9..1fc9d6a 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -4,6 +4,7 @@ import psycopg2 import traceback from Forums.Utilities.utilities import * from dateutil.relativedelta import relativedelta, FR +from scipy.spatial import distance def connectDataBase(): @@ -113,6 +114,28 @@ def verifyUser(cur, nameUser, forumId): print (trace) +def verifyImage(cur, base64Image): + + try: + + cur.execute("lock table market_images IN ACCESS EXCLUSIVE MODE") + + cur.execute("select image_id from market_images where hash_image = %(hashImage)s limit 1", + {'hashImage': generate_image_hash(base64Image)}) + + recset = cur.fetchall() + + if recset: + return recset[0]['image_id'] + else: + return 0 + + except: + + trace = traceback.format_exc() + print (trace) + + def getLastForum(cur): try: @@ -169,6 +192,7 @@ def getLastUser(cur): trace = traceback.format_exc() print (trace) + def getLastUserVersion(cur, userId): try: @@ -187,6 +211,7 @@ def getLastUserVersion(cur, userId): trace = traceback.format_exc() print (trace) + def getLastTopicVersion(cur, topicId): try: @@ -205,6 +230,7 @@ def getLastTopicVersion(cur, topicId): trace = traceback.format_exc() print (trace) + def getLastPostVersion(cur, postId): try: @@ -223,6 +249,7 @@ def getLastPostVersion(cur, postId): trace = traceback.format_exc() print (trace) + def getLastPost(cur): try: @@ -241,6 +268,26 @@ def getLastPost(cur): trace = traceback.format_exc() print (trace) + +def getLastImage(cur): + + try: + + cur.execute("select image_id from market_images order by image_id desc limit 1") + + recset = cur.fetchall() + + if recset: + return recset[0]['image_id'] + else: + return 0 + + except: + + trace = traceback.format_exc() + print (trace) + + def create_forum(cur, row, url): forumId = verifyForum(cur, row[0]) @@ -348,8 +395,12 @@ def create_author(cur, row, forumId): userId = verifyUser(cur, row[2], forumId) if not userId: - userId = int(getLastUser(cur) + 1) + newUser = True + else: + newUser = False + + if newUser: sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \ "signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)" @@ -375,19 +426,21 @@ def create_user(cur, row, forumId, index): if newUser: - sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \ - "signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)" + imageId = create_image(cur, row[9][index], row[8]) - recset = [userId, forumId, - row[10][index], - row[11][index] if row[11][index] != '-1' else None, - row[12][index] if row[12][index] != '-1' else None, - row[13][index] if row[13][index] != '-1' else None, - row[14][index] if row[14][index] != '-1' else None, - row[9][index] if row[9][index] != '-1' else None, - row[8]] + sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \ + "signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)" - cur.execute(sql, recset) + recset = [userId, forumId, + row[10][index], + row[11][index] if row[11][index] != '-1' else None, + row[12][index] if row[12][index] != '-1' else None, + row[13][index] if row[13][index] != '-1' else None, + row[14][index] if row[14][index] != '-1' else None, + imageId, + row[8]] + + cur.execute(sql, recset) else: @@ -397,13 +450,15 @@ def create_user(cur, row, forumId, index): recset = cur.fetchall() - #decode_decrypt_image_in_base64(recset[0]['image_user']) + imageId = recset[0]['image_user'] + if not is_same_image(row[9][index], recset[0]['image_user'])): + imageId = create_image(cur, row[9][index], row[8]) if (str(recset[0]['status_user']) != str(row[11][index] if row[11][index] != '-1' else None) or str(recset[0]['reputation_user']) != str(row[12][index] if row[12][index] != '-1' else None) or str(recset[0]['interest_user']) != str(row[13][index] if row[13][index] != '-1' else None) or str(recset[0]['signature_user']) != str(row[14][index] if row[14][index] != '-1' else None) or - str(recset[0]['image_user']) != str(row[9][index] if row[9][index] != '-1' else None)): # there was a change in the user information + str(recset[0]['image_user']) != str(imageId)): # there was a change in the user information if (str(recset[0]['status_user']) != 'Nan' or str(recset[0]['reputation_user']) != 'Nan' or @@ -434,8 +489,8 @@ def create_user(cur, row, forumId, index): 'reputation_user': row[12][index] if row[12][index] != '-1' else None, 'interest_user': row[13][index] if row[13][index] != '-1' else None, 'signature_user': row[14][index] if row[14][index] != '-1' else None, - 'image_user': row[9][index] if row[9][index] != '-1' else None, - 'dateinserted_user': row[8] if row[8] != '-1' else None, + 'image_user': imageId, + 'dateinserted_user': row[8], 'userId': userId}) return userId @@ -452,8 +507,14 @@ def create_posts(cur, row, forumId, topicId): postId = verifyPost(cur, topicId, userId, row[17][i]) if not postId: - postId = int(getLastPost(cur) + 1) + newPost = True + else: + newPost = False + + if newPost: + + imageId = create_image(cur, row[18][i], row[8]) sql = "Insert into posts (post_id, topic_id, user_id, content_post, feedback_post, image_post," \ "dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s)" @@ -461,8 +522,8 @@ def create_posts(cur, row, forumId, topicId): recset = [postId, topicId, userId, row[15][i] if row[15][i] != '-1' else None, row[16][i] if row[16][i] != '-1' else None, - row[18][i] if row[18][i] != '-1' else None, - row[17][i] if row[17][i] != '-1' else None, + imageId, + row[17][i], row[8]] cur.execute(sql, recset) @@ -475,11 +536,13 @@ def create_posts(cur, row, forumId, topicId): recset = cur.fetchall() + imageId = recset[0]['image_post'] + if not is_same_image(row[18][i], recset[0]['image_post'])): + imageId = create_image(cur, row[18][i], row[8]) + if (str(recset[0]['content_post']) != str(row[15][i]) or str(recset[0]['feedback_post']) != str(row[16][i] if row[16][i] != '-1' else None) or - str(recset[0]['image_post']) != str(row[18][i] if row[18][i] != '-1' else None)): # there was a change in the post information - - #decode_decrypt_image_in_base64(recset[0]['image_post']) + str(recset[0]['image_post']) != str(imageId)): # there was a change in the post information postVersionId = int(getLastPostVersion(cur, postId) + 1) @@ -499,10 +562,78 @@ def create_posts(cur, row, forumId, topicId): "image_post = %(image_post)s, dateinserted_post = %(dateinserted_post)s where post_id = %(postId)s" cur.execute(sql, {'content_post': row[15][i] if row[15][i] != '-1' else None, 'feedback_post': row[16][i] if row[16][i] != '-1' else None, - 'image_post': row[18][i] if row[18][i] != '-1' else None, + 'image_post': imageId, 'dateinserted_post': row[8], 'postId': postId}) +def is_same_image(newBase64Image, oldImageId): + + if newBase64Image == "-1" and oldImageId == "-1": + return True + + if newBase64Image == "-1": # and oldImageId != "-1" + return False + + if oldImageId == "-1": # and newBase64Image != "-1" + return False + + sql = "select * from market_images where image_id = %(imageId)s" + cur.execute(sql, {'imageId': oldImageId}) + + recset = cur.fetchall() + + decImage = decode_decrypt_image_in_base64(base64Image) + + hash1 = generate_image_hash(decImage) + hash2 = recset[0]['hash_image'] + + if hash1 == hash2: + return True + + features1 = json.loads(extract_hidden_layer_output(dec_string)) + features2 = json.loads(recset[0]['resnet50_image']) + + similarity = 1 - distance.cosine(features1, features2) + + return similarity < 0.8 + +def create_image(cur, base64Image, date): + + if base64Image == "-1": + return None + + imageId = verifyImage(base64Image) + + if not imageId: + imageId = int(getLastImage(cur) + 1) + newImage = True + else: + newImage = False + + if newImage: + + decImage = decode_decrypt_image_in_base64(base64Image) + + hashImage = generate_image_hash(decImage) + hloFeatures = extract_hidden_layer_output(decImage) + keypoints, descriptors = extract_keypoints(decImage) + + sql = "Insert into market_images(image_id, hash_image, base64_image, " \ + "resnet50_image, siftkey_image, siftdesc_image, dateinserted_image) " \ + "Values (%s, %s, %s, %s, %s, %s, %s)" + + recset = [imageId, + hashImage, + base64Image, + hloFeatures, + keypoints, + descriptors, + date] + + cur.execute(sql, recset) + + return imageId + def create_status(cur, forumId, date, listings, descriptions, status): date = datetime.strptime(date, "%m%d%Y") @@ -529,6 +660,15 @@ def create_database(cur, con): try: + sql = "create table forum_images(image_id integer not null, hash_image character varying(64) not null, base64_image character varying(10000000) not null, " \ + "resnet50_image character varying(1000000) null, siftkey_image character varying(1000000) null, siftdesc_image character varying(1000000) null, " \ + "dateinserted_item timestamp(6) with time zone not null, " \ + "constraint items_pk primary key (image_id))" + cur.execute(sql) + + sql = "create unique index unique_image ON forum_images USING btree (hash_image ASC NULLS LAST)" + cur.execute(sql) + sql = "create table forums (forum_id integer NOT NULL, name_forum character varying(255) NOT NULL, url_forum " \ "character varying(255) NOT null, dateinserted_forum timestamp(6) with time zone NOT NULL, " \ "constraint forums_pk primary key (forum_id))" @@ -546,9 +686,10 @@ def create_database(cur, con): sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \ "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \ "interest_user character varying(5000) null, signature_user character varying(1000) null, " \ - "image_user character varying(10000000) null, dateinserted_user timestamp(6) with time zone NOT NULL, " \ + "image_user integer null, dateinserted_user timestamp(6) with time zone NOT NULL, " \ "constraint users_pk primary key (user_id), " \ - "constraint users_forum_id_fk foreign key (forum_id) references forums (forum_id))" + "constraint users_forum_id_fk foreign key (forum_id) references forums (forum_id), " \ + "constraint users_image_id_fk foreign key (image_user) references forum_images (image_id))" cur.execute(sql) sql = "create unique index unique_user ON users USING btree (forum_id ASC NULLS LAST, name_user ASC NULLS LAST)" @@ -557,11 +698,12 @@ def create_database(cur, con): sql = "create table users_history(user_id integer NOT NULL, version_user integer not null, forum_id integer NOT NULL, " \ "name_user character varying(255) NOT NULL, status_user character varying(255) null, " \ "reputation_user character varying(255) null, interest_user character varying(5000) null, " \ - "signature_user character varying(1000) null, image_user character varying(10000000) null, " \ + "signature_user character varying(1000) null, image_user integer null, " \ "dateinserted_user timestamp(6) with time zone NOT NULL, " \ "constraint users_history_pk primary key (user_id, version_user), " \ "constraint users_history_user_id_fk foreign key (user_id) references users (user_id), " \ - "constraint users_history_forum_id_fk foreign key (forum_id) references forums (forum_id))" + "constraint users_history_forum_id_fk foreign key (forum_id) references forums (forum_id), " \ + "constraint users_history_image_id_fk foreign key (image_user) references forum_images (image_id))" cur.execute(sql) sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \ @@ -589,11 +731,12 @@ def create_database(cur, con): sql = "create table posts(post_id integer NOT NULL, topic_id integer NOT NULL, " \ "user_id integer NOT NULL, content_post character varying(100000) NOT null, feedback_post integer null, " \ - "image_post character varying(10000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ + "image_post integer null, dateadded_post timestamp(6) with time zone NOT NULL, " \ "dateinserted_post timestamp(6) with time zone NOT NULL, " \ "constraint posts_pk primary key (post_id), " \ "constraint posts_user_id_fk foreign key (user_id) references users (user_id), " \ - "constraint posts_topic_id_fk foreign key (topic_id) references topics (topic_id))" + "constraint posts_topic_id_fk foreign key (topic_id) references topics (topic_id), " \ + "constraint posts_image_id_fk foreign key (image_post) references forum_images (image_id))" cur.execute(sql) sql = "create unique index unique_post ON posts USING btree (topic_id ASC NULLS LAST, user_id ASC NULLS LAST, " \ @@ -602,12 +745,13 @@ def create_database(cur, con): sql = "create table posts_history(post_id integer NOT NULL, version_post integer not null, topic_id integer NOT NULL, " \ "user_id integer NOT NULL, content_post character varying(100000) NOT null, feedback_post integer null, " \ - "image_post character varying(10000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ + "image_post integer null, dateadded_post timestamp(6) with time zone NOT NULL, " \ "dateinserted_post timestamp(6) with time zone NOT NULL, " \ "constraint posts_history_pk primary key (post_id, version_post), " \ "constraint posts_history_user_id_fk foreign key (user_id) references users (user_id), " \ "constraint posts_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \ - "constraint posts_history_post_id_fk foreign key (post_id) references posts (post_id))" + "constraint posts_history_post_id_fk foreign key (post_id) references posts (post_id), " \ + "constraint posts_history_image_id_fk foreign key (image_post) references forum_images (image_id))" cur.execute(sql) con.commit() @@ -621,4 +765,4 @@ def create_database(cur, con): if (trace.find("already exists")==-1): print ("There was a problem during the database creation." ) traceback.print_exc() - raise SystemExit \ No newline at end of file + raise SystemExit diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py index a8f34ea..2b13602 100644 --- a/Forums/Utilities/utilities.py +++ b/Forums/Utilities/utilities.py @@ -7,9 +7,14 @@ import hashlib import base64 import io import configparser +import json +import keras +import cv2 +import numpy as np +from keras.preprocessing import image +from keras.applications.imagenet_utils import preprocess_input +from keras.models import Model from datetime import datetime, timedelta -import datetime as fulldatetime -from bs4 import BeautifulSoup from lxml import html as lxml from selenium.webdriver.common.by import By from Crypto.Cipher import AES @@ -40,6 +45,89 @@ aes_key = generate_aes_key() encryptCipher = AES.new(aes_key, AES.MODE_ECB) decryptCipher = AES.new(aes_key, AES.MODE_ECB) +model = keras.applications.ResNet50(weights='imagenet', include_top=True) +feat_extractor = Model(inputs=model.input, outputs=model.get_layer('avg_pool').output) + +sift = cv2.SIFT_create( + nfeatures=0, # Number of features, 0 for unlimited + nOctaveLayers=3, # Number of layers per octave + contrastThreshold=0.09, # Contrast threshold + edgeThreshold=10, # Edge threshold + sigma=1.6 # Initial Gaussian blur sigma +) + + +def generate_image_hash(image_string): + + image_bytes = bytes(image_string, encoding='utf-8') + image_bytes = base64.b64decode(image_bytes) + + return hashlib.sha256(image_bytes).hexdigest() + + +def extract_hidden_layer_output(image_string): + + image_bytes = bytes(image_string, encoding='utf-8') + image_bytes = base64.b64decode(image_bytes) + im = Image.open(io.BytesIO(image_bytes)).convert('RGB') + + x = image.img_to_array(im) + x = image.smart_resize(x, size=model.input_shape[1:3], interpolation='nearest') + x = np.expand_dims(x, axis=0) + x = preprocess_input(x) + + return json.dumps(feat_extractor.predict(x)[0].tolist()) + + +def extract_keypoints(image_string): + + image_bytes = bytes(image_string, encoding='utf-8') + image_bytes = base64.b64decode(image_bytes) + image_array = np.asarray(bytearray(image_bytes), dtype=np.uint8) + + img = cv2.imdecode(image_array, cv2.IMREAD_GRAYSCALE) + + keypoints, descriptors = sift.detectAndCompute(img, None) + + return json.dumps(wrap_keypoints(keypoints)), json.dumps(descriptors.tolist()) + + +def wrap_keypoints(keypoints): + + keypoints_list = [] + + for i in range(len(keypoints)): + temp = { + 'pt': keypoints[i].pt, + 'size': keypoints[i].size, + 'angle': keypoints[i].angle, + 'octave': keypoints[i].octave, + 'response': keypoints[i].response, + 'class_id': keypoints[i].class_id + } + keypoints_list.append(temp) + + return keypoints_list + + +def unwrap_keypoints(keypoints_list): + + keypoints = [] + + for temp in keypoints_list: + point = cv2.KeyPoint( + x=temp['pt'][0], + y=temp['pt'][1], + size=temp['size'], + angle=temp['angle'], + octave=temp['octave'], + response=temp['response'], + class_id=temp['class_id'] + ) + keypoints.append(point) + + return tuple(keypoints) + def cleanText(originalText): @@ -53,138 +141,6 @@ def cleanText(originalText): return originalText -def convertDate(sdate, language, crawlerDate): - - if language == "english": - - todaysday = crawlerDate.strftime("%m/%d/%Y") - - sdate = sdate.replace(u"January","01") - sdate = sdate.replace(u"February","02") - sdate = sdate.replace(u"March","03") - sdate = sdate.replace(u"April","04") - sdate = sdate.replace(u"May","05") - sdate = sdate.replace(u"June","06") - sdate = sdate.replace(u"July","07") - sdate = sdate.replace(u"August","08") - sdate = sdate.replace(u"September","09") - sdate = sdate.replace(u"October","10") - sdate = sdate.replace(u"November","11") - sdate = sdate.replace(u"December","12") - sdate = sdate.replace(u"Jan","01") - sdate = sdate.replace(u"Feb","02") - sdate = sdate.replace(u"Mar","03") - sdate = sdate.replace(u"Apr","04") - sdate = sdate.replace(u"May","05") - sdate = sdate.replace(u"Jun","06") - sdate = sdate.replace(u"Jul","07") - sdate = sdate.replace(u"Aug","08") - sdate = sdate.replace(u"Sep","09") - sdate = sdate.replace(u"Oct","10") - sdate = sdate.replace(u"Nov","11") - sdate = sdate.replace(u"Dec","12") - sdate = sdate.replace(u".","") - - if sdate == "Today at": - sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%m %d %Y') - - sdate = datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y') - - elif language == "french": - - todaysday = crawlerDate.strftime("%m/%d/%Y") - - sdate = sdate.replace(u"janvier","01") - sdate = sdate.replace(u"jan","01") - sdate = sdate.replace(u"février","02") - sdate = sdate.replace(u"juin","06") - sdate = sdate.replace(u"juillet","07") - sdate = sdate.replace(u"juil","07") - sdate = sdate.replace(u"août","08") - sdate = sdate.replace(u"septembre","09") - sdate = sdate.replace(u"sept","09") - sdate = sdate.replace(u"octobre","10") - sdate = sdate.replace(u"oct","10") - sdate = sdate.replace(u"novembre","11") - sdate = sdate.replace(u"nov","11") - sdate = sdate.replace(u"décembre","12") - sdate = sdate.replace(u"déc","12") - sdate = sdate.replace(u".","") - - if sdate == u"Aujourd'hui": - sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%d %m %Y') - - if "mar" in sdate: - print ("Add March to the IBM Black Market") - raise SystemExit - elif "avr" in sdate: - print ("Add April to the IBM Black Market") - raise SystemExit - elif "mai" in sdate: - print ("Add May to the IBM Black Market") - raise SystemExit - - sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y') - - elif language == "swedish": - - sdate = sdate.replace(u"jan","01") - sdate = sdate.replace(u"feb","02") - sdate = sdate.replace(u"mar","03") - sdate = sdate.replace(u"apr","04") - sdate = sdate.replace(u"maj","05") - sdate = sdate.replace(u"jun","06") - sdate = sdate.replace(u"jul","07") - sdate = sdate.replace(u"aug","08") - sdate = sdate.replace(u"sep","09") - sdate = sdate.replace(u"okt","10") - sdate = sdate.replace(u"nov","11") - sdate = sdate.replace(u"dec","12") - sdate = sdate.replace(u".","") - - sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y') - - elif language == "russian": - - if sdate == u'\u0412\u0447\u0435\u0440\u0430': - sdate = crawlerDate.today() - timedelta(1) - sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y') - elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate: - return "" - - sdate = sdate.replace(u"января","01") - sdate = sdate.replace(u"янв","01") - sdate = sdate.replace(u"февраля","02") - sdate = sdate.replace(u"Февраль", "02") - sdate = sdate.replace(u"фев","02") - sdate = sdate.replace(u"марта","03") - sdate = sdate.replace(u"апреля","04") - sdate = sdate.replace(u"апр","04") - sdate = sdate.replace(u"мар","05") - sdate = sdate.replace(u"май","05") - sdate = sdate.replace(u"мая","05") - sdate = sdate.replace(u"июня","06") - sdate = sdate.replace(u"июн","06") - sdate = sdate.replace(u"июля","07") - sdate = sdate.replace(u"июл","07") - sdate = sdate.replace(u"августа","08") - sdate = sdate.replace(u"авг","08") - sdate = sdate.replace(u"сентября","09") - sdate = sdate.replace(u"сен","09") - sdate = sdate.replace(u"октября","10") - sdate = sdate.replace(u"Октябрь","10") - sdate = sdate.replace(u"окт","10") - sdate = sdate.replace(u"ноября","11") - sdate = sdate.replace(u"ноя","11") - sdate = sdate.replace(u"декабря","12") - sdate = sdate.replace(u"дек","12") - sdate = sdate.replace(u".","") - - sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y') - - return sdate - - def cleanLink(originalLink): safe_chars = string.ascii_letters + string.digits @@ -256,61 +212,12 @@ def cleanString(originalString): updated_string = updated_string.replace("\r", "") #replace all carriage returns updated_string = updated_string.replace("'", "^") #replace all semicolons updated_string = updated_string.replace(u"»", '') #replace all arrows - updated_string = updated_string.replace("!", "") - updated_string = updated_string.replace(";", "") #replace all exclamations + updated_string = updated_string.replace("!", "") #replace all exclamation points + updated_string = updated_string.replace(";", "") #replace all exclamations return updated_string -#function to convert long informal date string to formal date -def convertFromLongDate(longDate, crawlerdate): - list_of_words = [] - list_of_words = longDate.split() - - day = 0 - week = 0 - hour = 0 - second = 0 - minute = 0 - year = 0 - total_days = 0 - - if 'days' in list_of_words: - index = list_of_words.index('days') - day = float(list_of_words[index - 1]) - - if 'weeks' in list_of_words: - index = list_of_words.index('weeks') - week = float(list_of_words[index - 1]) - - if 'hours' in list_of_words: - index = list_of_words.index('hours') - hour = float(list_of_words[index - 1]) - - if 'seconds' in list_of_words: - index = list_of_words.index('seconds') - second = float(list_of_words[index - 1]) - - if 'minutes' in list_of_words: - index = list_of_words.index('minutes') - minute = float(list_of_words[index - 1]) - - if 'years' in list_of_words: - index = list_of_words.index('years') - year = float(list_of_words[index - 1]) - - if year != 0: - total_days = day + 365 * year - - #today = datetime.date.today() - timeDelta = fulldatetime.timedelta(days=total_days, weeks=week, hours=hour, seconds=second, minutes=minute) - - date = crawlerdate - timeDelta - correct_date = str(date.strftime('%m/%d/%Y')) - - return correct_date - - def cleanNumbers(inputString): reg_ex = re.compile(r'[^\d.]+') @@ -319,20 +226,16 @@ def cleanNumbers(inputString): return updated_string -def aes_encryption(item): +def aes_encryption(data_bytes): - to_bytes = bytes(item) - - encrypted_bytes = encryptCipher.encrypt(pad(to_bytes, BLOCK_SIZE)) + encrypted_bytes = encryptCipher.encrypt(pad(data_bytes, BLOCK_SIZE)) return encrypted_bytes -def aes_decryption(item): - - to_bytes = bytes(item) +def aes_decryption(data_bytes): - decrypted_bytes = decryptCipher.decrypt(to_bytes) + decrypted_bytes = decryptCipher.decrypt(data_bytes) return unpad(decrypted_bytes, BLOCK_SIZE) @@ -346,28 +249,28 @@ def encrypt_encode_image_to_base64(driver, xpath): encrypted_image = aes_encryption(image_data) base64_image = base64.b64encode(encrypted_image) - string_image = base64_image.decode('utf-8') + enc_image_string = base64_image.decode('utf-8') - return string_image + return enc_image_string - except: + except Exception as e: + print(e) pass return None -def decode_decrypt_image_in_base64(string_image): +def decode_decrypt_image_in_base64(image_string): try: - base64_image = bytes(string_image, encoding='utf-8') - encrypted_image = base64.b64decode(base64_image) - decrypted_image = aes_decryption(encrypted_image) - - im = Image.open(io.BytesIO(decrypted_image)) - im.show() + image_bytes = bytes(image_string, encoding='utf-8') + encrypted_bytes = base64.b64decode(image_bytes) + decrypted_image = aes_decryption(encrypted_bytes) + base64_image = base64.b64encode(decrypted_image) + dec_image_string = base64_image.decode('utf-8') - return decrypted_image + return dec_image_string except Exception as e: print(e) diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index ab8eb6c..d601f8d 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -2,7 +2,6 @@ __author__ = 'DarkWeb' import psycopg2 import traceback -import configparser from MarketPlaces.Utilities.utilities import * from dateutil.relativedelta import relativedelta, FR from scipy.spatial import distance @@ -202,6 +201,26 @@ def getLastItemVersion(cur, itemId): trace = traceback.format_exc() print(trace) + +def getLastImage(cur): + + try: + + cur.execute("select image_id from market_images order by image_id desc limit 1") + + recset = cur.fetchall() + + if recset: + return recset[0]['image_id'] + else: + return 0 + + except: + + trace = traceback.format_exc() + print (trace) + + def create_marketPlace(cur, row, url): marketId = verifyMarketPlace(cur, row[0]) @@ -240,7 +259,7 @@ def create_vendor(cur, row, marketId): row[1], row[2] if row[2] != '-1' else None, row[3] if row[3] != '-1' else None, - row[21] if row[21] != '-1' else None, + imageId, row[23]] cur.execute(sql, recset) @@ -560,10 +579,10 @@ def create_database(cur, con): sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \ "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \ - "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ + "null, image_vendor integer null, dateinserted_vendor timestamp(6) with time zone not null, " \ "constraint vendors_pk primary key (vendor_id), " \ "constraint vendors_market_id_fk foreign key (market_id) references marketplaces (market_id), " \ - "constraint vendors_image_id_fkey foreign key (image_id) references market_images (image_id))" + "constraint vendors_image_id_fkey foreign key (image_vendor) references market_images (image_id))" cur.execute(sql) sql = "create unique index unique_vendor ON vendors USING btree (market_id ASC NULLS LAST, name_vendor ASC NULLS LAST)" @@ -571,11 +590,11 @@ def create_database(cur, con): sql = "create table vendors_history(vendor_id integer not null, version_vendor integer not null, market_id integer not null, name_vendor " \ "character varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor " \ - "integer null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ + "integer null, image_vendor integer null, dateinserted_vendor timestamp(6) with time zone not null, " \ "constraint vendors_history_pk primary key (vendor_id, version_vendor), " \ "constraint vendors_history_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id), " \ "constraint vendors_history_market_id_fkey foreign key (market_id) references marketplaces (market_id), " \ - "constraint vendors_history_image_id_fkey foreign key (image_id) references market_images (image_id))" + "constraint vendors_history_image_id_fkey foreign key (image_vendor) references market_images (image_id))" cur.execute(sql) sql = "create table items(item_id integer not null, market_id integer not null, vendor_id integer not null, name_item character " \ @@ -585,13 +604,13 @@ def create_database(cur, con): "character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \ "null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \ "character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \ - "varying(255) null, image_id integer null, href_item character varying(255) not null, " \ + "varying(255) null, image_item integer null, href_item character varying(255) not null, " \ "lastseen_item timestamp(6) with time zone not null, dateinserted_item timestamp(6) with time zone not null, " \ "classification_item double precision null, " \ "constraint items_pk primary key (item_id), " \ "constraint items_market_id_fkey foreign key (market_id) references marketplaces (market_id), " \ "constraint items_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id), " \ - "constraint items_image_id_fkey foreign key (image_id) references market_images (image_id))" + "constraint items_image_id_fkey foreign key (image_item) references market_images (image_id))" cur.execute(sql) sql = "create unique index unique_item ON items USING btree (market_id ASC NULLS LAST, href_item ASC NULLS LAST)" @@ -604,14 +623,14 @@ def create_database(cur, con): "character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \ "null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \ "character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \ - "varying(255) null, image_id integer null, href_item character varying(255) not null, " \ + "varying(255) null, image_item integer null, href_item character varying(255) not null, " \ "lastseen_item timestamp(6) with time zone not null, dateinserted_item timestamp(6) with time zone not null, " \ "classification_item double precision null, " \ "constraint items_history_pk primary key (item_id, version_item), " \ "constraint items_history_item_id_fkey foreign key (item_id) references items (item_id), " \ "constraint items_history_market_id_fkey foreign key (market_id) references marketplaces (market_id), " \ "constraint items_history_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id), " \ - "constraint items_history_image_id_fkey foreign key (image_id) references market_images (image_id))" + "constraint items_history_image_id_fkey foreign key (image_item) references market_images (image_id))" cur.execute(sql) con.commit()