diff --git a/Forums/BestCardingWorld/parser.py b/Forums/BestCardingWorld/parser.py index 7ad385b..3af4f87 100644 --- a/Forums/BestCardingWorld/parser.py +++ b/Forums/BestCardingWorld/parser.py @@ -18,15 +18,15 @@ def bestcardingworld_description_parser(soup): topic = "-1" # 0 topic name user = [] # 1 all users of each post - addDate = [] # 2 all dated of each post - feedback = [] # 3 all feedbacks of each user (this was found in just one Forum and with a number format) - status = [] # 4 all user's authority in each post such as (adm, member, dangerous) - reputation = [] # 5 all users's karma in each post (usually found as a number) - sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post) - post = [] # 7 all messages of each post - interest = [] # 8 all user's interest in each post - image = [] - image_user = [] + status = [] # 2 all user's authority in each post such as (adm, member, dangerous) + reputation = [] # 3 all users' karma in each post (usually found as a number) + interest = [] # 4 all user's interest in each post + sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) + post = [] # 6 all messages of each post + feedback = [] # 7 all feedbacks of each user (this was found in just one Forum and with a number format) + addDate = [] # 8 all dates of each post + image_user = [] # 9 all user avatars of each post + image_post = [] # 10 all first images of each post # Finding the topic (should be just one coming from the Listing Page) @@ -157,15 +157,18 @@ def bestcardingworld_description_parser(soup): img = img.get('src').split('base64,')[-1] else: img = "-1" - image.append(img) + image_post.append(img) img = ipost.find('div', {"class": "avatar-container"}).find('img', {"class": "avatar"}) - img = img.get('src').split('base64,')[-1] + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" image_user.append(img) # Populate the 
final variable (this should be a list with all fields scraped) - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) + row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) # Sending the results @@ -179,17 +182,18 @@ def bestcardingworld_description_parser(soup): #return: 'row' that contains a variety of lists that each hold info on the listing page def bestcardingworld_listing_parser(soup): - nm = 0 # *this variable should receive the number of topics + nm = 0 # *this variable should receive the number of topics forum = "BestCardingWorld" # 0 *forum name - board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) - author = [] # 2 *all authors of each topic - topic = [] # 3 *all topics - views = [] # 4 number of views of each topic - posts = [] # 5 number of posts of each topic - href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between - # Listing and Description pages) - addDate = [] # 7 when the topic was created (difficult to find) + board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. 
The board here should be Malware) + author = [] # 2 *all authors of each topic + topic = [] # 3 *all topics + views = [] # 4 number of views of each topic + posts = [] # 5 number of posts of each topic + href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between + # Listing and Description pages) + addDate = [] # 7 when the topic was created (difficult to find) + image_author = [] # 8 all author avatars used in each topic # Finding the board (should be just one) @@ -243,6 +247,8 @@ def bestcardingworld_listing_parser(soup): user = ps.strip() author.append(cleanString(user)) + image_author.append(-1) + # Finding the number of replies post = replies[index].text.split()[0] post = post.strip() @@ -263,7 +269,7 @@ def bestcardingworld_listing_parser(soup): index += 1 - return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author) #called by the crawler to get description links on a listing page diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index dfdec49..6cc9c60 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -519,9 +519,9 @@ def create_database(cur, con): cur.execute(sql) sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, " \ - "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \ - "CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \ - "CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))" + "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, " \ + "constraint forums_status_pk PRIMARY KEY (forum_id, date_inserted), " \ + "constraint forums_status_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))" 
cur.execute(sql) sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \ @@ -529,7 +529,7 @@ def create_database(cur, con): "interest_user character varying(5000) null, signature_user character varying(1000) null, " \ "image_user character varying(10000000) null, dateinserted_user timestamp(6) with time zone NOT NULL, " \ "constraint users_pk primary key (user_id), " \ - "constraint users_forum_id_fkey foreign key (forum_id) references forums (forum_id))" + "constraint users_forum_id_fk foreign key (forum_id) references forums (forum_id))" cur.execute(sql) sql = "create unique index unique_user ON users USING btree (forum_id ASC NULLS LAST, name_user ASC NULLS LAST)" @@ -541,17 +541,17 @@ def create_database(cur, con): "signature_user character varying(1000) null, image_user character varying(10000000) null, " \ "dateinserted_user timestamp(6) with time zone NOT NULL, " \ "constraint users_history_pk primary key (user_id, version_user), " \ - "constraint users_history_user_id_fkey foreign key (user_id) references " \ - "users (user_id), constraint users_history_forum_id_fkey foreign key (forum_id) references forums (forum_id))" + "constraint users_history_user_id_fk foreign key (user_id) references users (user_id), " \ + "constraint users_history_forum_id_fk foreign key (forum_id) references forums (forum_id))" cur.execute(sql) sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \ "title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, " \ "posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with time zone null, " \ "dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision NOT NULL, " \ - "constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \ - "foreign key (author_id) references 
users (user_id), constraint topics_forum_id_fkey foreign key (" \ - "forum_id) references forums (forum_id))" + "constraint topics_pk primary key (topic_id), " \ + "constraint topics_author_id_fk foreign key (author_id) references users (user_id), " \ + "constraint topics_forum_id_fk foreign key (forum_id) references forums (forum_id))" cur.execute(sql) sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, author_id ASC NULLS LAST, " \ @@ -564,9 +564,9 @@ def create_database(cur, con): "dateadded_topic timestamp(6) with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, " \ "classification_topic double precision NOT NULL, " \ "constraint topics_history_pk primary key (topic_id, version_topic), " \ - "constraint topics_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \ - "constraint topics_history_author_id_fkey foreign key (author_id) references users (user_id), " \ - "constraint topics_history_board_id_fkey foreign key (forum_id) references forums (forum_id))" + "constraint topics_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \ + "constraint topics_history_author_id_fk foreign key (author_id) references users (user_id), " \ + "constraint topics_history_board_id_fk foreign key (forum_id) references forums (forum_id))" cur.execute(sql) sql = "create table posts(post_id integer NOT NULL, topic_id integer NOT NULL, " \ @@ -574,8 +574,8 @@ def create_database(cur, con): "image_post character varying(10000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ "dateinserted_post timestamp(6) with time zone NOT NULL, " \ "constraint posts_pk primary key (post_id), " \ - "constraint posts_user_id_fkey foreign key (user_id) references users (user_id), constraint " \ - "posts_topic_id_fkey foreign key (topic_id) references topics (topic_id))" + "constraint posts_user_id_fk foreign key (user_id) references users (user_id), " \ + "constraint 
posts_topic_id_fk foreign key (topic_id) references topics (topic_id))" cur.execute(sql) sql = "create unique index unique_post ON posts USING btree (topic_id ASC NULLS LAST, user_id ASC NULLS LAST, " \ @@ -587,9 +587,9 @@ def create_database(cur, con): "image_post character varying(10000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ "dateinserted_post timestamp(6) with time zone NOT NULL, " \ "constraint posts_history_pk primary key (post_id, version_post), " \ - "constraint posts_history_user_id_fkey foreign key (user_id) references users (user_id), " \ - "constraint posts_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \ - "constraint posts_history_post_id_fkey foreign key (post_id) references posts (post_id))" + "constraint posts_history_user_id_fk foreign key (user_id) references users (user_id), " \ + "constraint posts_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \ + "constraint posts_history_post_id_fk foreign key (post_id) references posts (post_id))" cur.execute(sql) con.commit()