#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 30 19:17:12 2021
@author: ledi
"""
import time

import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
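# Open Douban's login page in a visible Chrome window and give the user
# 30 seconds to log in manually; the logged-in session is then reused for
# every group page fetched below.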
url = 'https://accounts.douban.com/passport/login?source=group'
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 50)
browser.get(url)
time.sleep(30)
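# The `wait` helper above is created but never used. As a sketch only (not
# part of the original flow), the fixed sleep could be replaced by waiting for
# an element that appears once login has finished; the selector below is an
# assumption about Douban's logged-in header, not verified here.
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support import expected_conditions as EC
# wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.nav-user-account')))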
data = []
for pa in range(10000):
    # Douban paginates the discussion list 25 topics at a time.
    kkt = 25 * pa
    url = 'https://www.douban.com/group/707669/discussion?start=' + str(kkt)
    browser.get(url)
    page = browser.page_source
    # print(page)  # debug: dump the raw page source
    soup = BeautifulSoup(page, "lxml")
    # Every topic row carries its link in a <td class="title"> cell.
    div_list = soup.find_all('td', class_='title')
    if not div_list:
        # No topics on this page: the list is exhausted, stop paging.
        break
    # The second HTML table on the page is the discussion list itself.
    dd = pd.read_html(page)[1].values
    time.sleep(3)
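    # `dd` is the list-page table as a NumPy array. On Douban's discussion
    # list the columns are title, author, reply count, and last-reply time,
    # so dd[:, 2] used below is the reply count (column order observed on the
    # site, not guaranteed by the markup).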
    for k in range(len(div_list)):
        # Take the topic link and title straight from the <a> tag rather than
        # splitting its string form, which breaks on titles containing spaces.
        link = div_list[k].a
        topic_url = link.get('href')
        topic_title = link.get('title', link.get_text(strip=True))
        temp = [topic_url, topic_title, dd[:, 2][k]]
        # Open the topic page itself to read the posting time.
        browser.get(temp[0])
        temp_html = browser.page_source
        et_html = etree.HTML(temp_html)
        # The second <span> in the topic header holds the post's timestamp.
        urls = et_html.xpath('//*[@id="topic-content"]/div[2]/h3/span[2]')
        this_time = [each.text.strip() for each in urls]
        kkp = temp + this_time
        data.append(kkp)
        print(kkp)
        time.sleep(0.2)
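# The rows collected above are never written out; a minimal sketch of saving
# them with pandas. The output filename is an assumption, and each row is
# [topic URL, title, reply count, post time] (the last item is missing when a
# topic page yielded no timestamp, which pandas pads with NaN).
pd.DataFrame(data).to_csv('douban_group_707669_topics.csv', index=False,
                          header=False, encoding='utf-8-sig')
browser.quit()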