It's All About The Logic: Find All Posts & The Corresponding Threads - On vBulletin
vBulletin. Main goal: at the end we have all the threads (and discussions) where our demo user is involved. (Note: this means that we should keep in mind a nice presentation of the gathered data.)
Solution 1:
See if this gets you started.
You can write some functions that pull out the thread ids using the post ids, then iterate through the thread-id pages and parse the data. I'm not going to spend much more time on this. You could possibly use the comments in the HTML as well to pull out some of the sections, but I think this is more or less the thought process you are looking for.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
# Browser-like User-Agent so the forum does not block the scripted requests.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}

# Forum member id of the user whose posts/threads we want to collect.
userId = 4793


def get_user_stats(userId):
    """Fetch the member page for *userId* and return its statistics
    fieldset as a dict.

    Returns:
        dict: {stat_name: numeric_value} — keys have all whitespace
        removed, values are parsed as floats (thousands separators are
        stripped first). <li> entries that do not split into exactly
        'key: value' are skipped.
    """
    url = 'https://forums.sagetv.com/forums/member.php'
    payload = {'u': f'{userId}'}
    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    stats_data = {}
    stats = soup.find('fieldset', {'class': "statistics_group"}).find_all('li')
    for each in stats:
        # Drop thousands separators so float() can parse the value.
        values = each.text.replace(',', '').split(':')
        if len(values) == 2:
            # Collapse internal whitespace in the key; parse value as a number.
            key, value = ''.join(values[0].split()), float(values[1])
            stats_data[key] = value
    return stats_data
def get_searchId(userId):
    """Run a vBulletin 'find all posts by user' search and return the
    searchid token assigned to the result set.

    The token is extracted from the first pagination link next to the
    'Page 1 of N' control on the result page.
    """
    # Bug fix: the original hard-coded '?do=finduser&u=4793' in the URL
    # while also passing the same keys in params, so the duplicated 'u'
    # made the function ignore *userId* for any other user. Keep only
    # the params.
    url = 'https://forums.sagetv.com/forums/search.php'
    payload = {'do': 'finduser',
               'u': f'{userId}'}
    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    searchId = soup.find('td', {'class': 'vbmenu_control'}, text=re.compile("^Page 1 of")).find_next('a')['href'].split('searchid=')[-1].split('&')[0]
    return searchId
def get_page_threadIds(threadId_list, soup):
    """Collect thread ids from one search-result page.

    Each post on the page lives in a <table id="post...">; any anchor
    inside it whose href carries a 't=' query parameter points at the
    containing thread. New ids are appended to *threadId_list* (mutated
    in place) and the same list is returned.
    """
    postIds = soup.find_all('table', {'id': re.compile("^post")})
    for each in postIds:
        for alpha in each.find_all('a'):
            if 't=' in alpha['href']:
                threadId = alpha['href'].split('t=')[-1]
                if threadId not in threadId_list:
                    threadId_list.append(threadId)
    return threadId_list
def get_all_threadIds(searchId):
    """Walk every page of search result *searchId* and return the
    de-duplicated list of thread ids found across all pages.
    """
    threadId_list = []
    url = 'https://forums.sagetv.com/forums/search.php'
    payload = {'searchid': '%s' % searchId,
               'pp': '200'}
    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    # 'Page 1 of N' control tells us how many result pages exist.
    total_pages = int(soup.find('td', {'class': 'vbmenu_control'}, text=re.compile("^Page 1 of")).text.split('of ')[-1])
    threadId_list = get_page_threadIds(threadId_list, soup)
    for page in range(2, total_pages + 1):
        payload.update({'page': '%s' % page})
        response = requests.get(url, headers=headers, params=payload)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Bug fix: the original did `threadId_list += get_page_threadIds(...)`.
        # The helper mutates the list in place AND returns that same list,
        # so `+=` appended the whole list to itself, duplicating every id
        # (only masked by the final set()). Plain reassignment is correct.
        threadId_list = get_page_threadIds(threadId_list, soup)
    return list(set(threadId_list))
# --- main script: gather stats, find all threads, scrape every post --------
stats = get_user_stats(userId)
searchId = get_searchId(userId)
threadId_list = get_all_threadIds(searchId)

rows = []
for threadId in threadId_list:
    url = 'https://forums.sagetv.com/forums/showthread.php'
    payload = {'t': '%s' % threadId,
               'pp': '40',
               'page': '1'}
    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Single-page threads have no 'Page 1 of N' control: soup.find returns
    # None and .text raises AttributeError (the bare except here used to
    # swallow everything and its body was fused onto the loop header).
    try:
        total_pages = int(soup.find('td', {'class': 'vbmenu_control'}, text=re.compile("^Page 1 of")).text.split('of ')[-1])
    except AttributeError:
        total_pages = 1
    for page in range(1, total_pages + 1):
        payload.update({'page': '%s' % page})
        response = requests.get(url, headers=headers, params=payload)
        soup = BeautifulSoup(response.text, 'html.parser')
        discussion = soup.find('td', {'class': 'navbar'}).text.strip()
        posts = soup.find_all('table', {'id': re.compile("^post")})
        for post in posts:
            # Header cell text is '<postNo> <date ...>'.
            dateStr = post.find('td', {'class': 'thead'}).text.split()
            postNo = dateStr[0]
            dateStr = ' '.join(dateStr[1:])
            postername = post.find('a', {'class': 'bigusername'}).text
            joinDate = post.find('div', text=re.compile("^Join Date:")).text.split('Join Date:')[-1].strip()
            # Location is optional in profiles; find() returns None then.
            try:
                location = post.find('div', text=re.compile("^Location:")).text.split('Location:')[-1].strip()
            except AttributeError:
                location = 'N/A'
            postNum = post.find('div', text=re.compile(".*Posts:")).text.split('Posts:')[-1].replace(',', '').strip()
            message = post.find('div', {'id': re.compile("^post_message_")}).text.strip()
            row = {'date': dateStr,
                   'postNumber': postNo,
                   'poster': postername,
                   'joinDate': joinDate,
                   'location': location,
                   'number of posts': postNum,
                   'thread': discussion,
                   'thread id': threadId,
                   'message': message}
            rows.append(row)
        print('Collected: %s - Page %0s of %s' % (discussion, page, total_pages))

df = pd.DataFrame(rows)
print(stats)
print(df)
Output:
Post a Comment for "Its All About The Logic: Findall Posts & The Corresponding Threads - On Vbulletin"