Browse Source

BestCarding + DbConnection Create Tables.

main
ericssonmarin-cpp 1 year ago
parent
commit
54fddc3df9
2 changed files with 46 additions and 40 deletions
  1. +29
    -23
      Forums/BestCardingWorld/parser.py
  2. +17
    -17
      Forums/DB_Connection/db_connection.py

+ 29
- 23
Forums/BestCardingWorld/parser.py View File

@ -18,15 +18,15 @@ def bestcardingworld_description_parser(soup):
topic = "-1" # 0 topic name
user = [] # 1 all users of each post
addDate = [] # 2 all dates of each post
feedback = [] # 3 all feedbacks of each user (this was found in just one Forum and with a number format)
status = [] # 4 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 5 all users' karma in each post (usually found as a number)
sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 7 all messages of each post
interest = [] # 8 all user's interest in each post
image = []
image_user = []
status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 3 all users' karma in each post (usually found as a number)
interest = [] # 4 all user's interest in each post
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each user (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
# Finding the topic (should be just one coming from the Listing Page)
@ -157,15 +157,18 @@ def bestcardingworld_description_parser(soup):
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image.append(img)
image_post.append(img)
img = ipost.find('div', {"class": "avatar-container"}).find('img', {"class": "avatar"})
img = img.get('src').split('base64,')[-1]
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
@ -179,17 +182,18 @@ def bestcardingworld_description_parser(soup):
#return: 'row' that contains a variety of lists that each hold info on the listing page
def bestcardingworld_listing_parser(soup):
nm = 0 # *this variable should receive the number of topics
nm = 0 # *this variable should receive the number of topics
forum = "BestCardingWorld" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_author = [] # 8 all author avatars used in each topic
# Finding the board (should be just one)
@ -243,6 +247,8 @@ def bestcardingworld_listing_parser(soup):
user = ps.strip()
author.append(cleanString(user))
image_author.append(-1)
# Finding the number of replies
post = replies[index].text.split()[0]
post = post.strip()
@ -263,7 +269,7 @@ def bestcardingworld_listing_parser(soup):
index += 1
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
#called by the crawler to get description links on a listing page


+ 17
- 17
Forums/DB_Connection/db_connection.py View File

@ -519,9 +519,9 @@ def create_database(cur, con):
cur.execute(sql)
sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, " \
"listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \
"CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \
"CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"
"listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, " \
"constraint forums_status_pk PRIMARY KEY (forum_id, date_inserted), " \
"constraint forums_status_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"
cur.execute(sql)
sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \
@ -529,7 +529,7 @@ def create_database(cur, con):
"interest_user character varying(5000) null, signature_user character varying(1000) null, " \
"image_user character varying(10000000) null, dateinserted_user timestamp(6) with time zone NOT NULL, " \
"constraint users_pk primary key (user_id), " \
"constraint users_forum_id_fkey foreign key (forum_id) references forums (forum_id))"
"constraint users_forum_id_fk foreign key (forum_id) references forums (forum_id))"
cur.execute(sql)
sql = "create unique index unique_user ON users USING btree (forum_id ASC NULLS LAST, name_user ASC NULLS LAST)"
@ -541,17 +541,17 @@ def create_database(cur, con):
"signature_user character varying(1000) null, image_user character varying(10000000) null, " \
"dateinserted_user timestamp(6) with time zone NOT NULL, " \
"constraint users_history_pk primary key (user_id, version_user), " \
"constraint users_history_user_id_fkey foreign key (user_id) references " \
"users (user_id), constraint users_history_forum_id_fkey foreign key (forum_id) references forums (forum_id))"
"constraint users_history_user_id_fk foreign key (user_id) references users (user_id), " \
"constraint users_history_forum_id_fk foreign key (forum_id) references forums (forum_id))"
cur.execute(sql)
sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \
"title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, " \
"posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with time zone null, " \
"dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision NOT NULL, " \
"constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \
"foreign key (author_id) references users (user_id), constraint topics_forum_id_fkey foreign key (" \
"forum_id) references forums (forum_id))"
"constraint topics_pk primary key (topic_id), " \
"constraint topics_author_id_fk foreign key (author_id) references users (user_id), " \
"constraint topics_forum_id_fk foreign key (forum_id) references forums (forum_id))"
cur.execute(sql)
sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, author_id ASC NULLS LAST, " \
@ -564,9 +564,9 @@ def create_database(cur, con):
"dateadded_topic timestamp(6) with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, " \
"classification_topic double precision NOT NULL, " \
"constraint topics_history_pk primary key (topic_id, version_topic), " \
"constraint topics_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \
"constraint topics_history_author_id_fkey foreign key (author_id) references users (user_id), " \
"constraint topics_history_board_id_fkey foreign key (forum_id) references forums (forum_id))"
"constraint topics_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \
"constraint topics_history_author_id_f foreign key (author_id) references users (user_id), " \
"constraint topics_history_board_id_fk foreign key (forum_id) references forums (forum_id))"
cur.execute(sql)
sql = "create table posts(post_id integer NOT NULL, topic_id integer NOT NULL, " \
@ -574,8 +574,8 @@ def create_database(cur, con):
"image_post character varying(10000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \
"dateinserted_post timestamp(6) with time zone NOT NULL, " \
"constraint posts_pk primary key (post_id), " \
"constraint posts_user_id_fkey foreign key (user_id) references users (user_id), constraint " \
"posts_topic_id_fkey foreign key (topic_id) references topics (topic_id))"
"constraint posts_user_id_fk foreign key (user_id) references users (user_id), " \
"constraint posts_topic_id_fk foreign key (topic_id) references topics (topic_id))"
cur.execute(sql)
sql = "create unique index unique_post ON posts USING btree (topic_id ASC NULLS LAST, user_id ASC NULLS LAST, " \
@ -587,9 +587,9 @@ def create_database(cur, con):
"image_post character varying(10000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \
"dateinserted_post timestamp(6) with time zone NOT NULL, " \
"constraint posts_history_pk primary key (post_id, version_post), " \
"constraint posts_history_user_id_fkey foreign key (user_id) references users (user_id), " \
"constraint posts_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \
"constraint posts_history_post_id_fkey foreign key (post_id) references posts (post_id))"
"constraint posts_history_user_id_fk foreign key (user_id) references users (user_id), " \
"constraint posts_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \
"constraint posts_history_post_id_fk foreign key (post_id) references posts (post_id))"
cur.execute(sql)
con.commit()


Loading…
Cancel
Save