写在前面:我之前是做Java开发的,今天部门的同事给了我一批视频链接,问我有没有办法爬出这批视频对应博主的粉丝量,以及每个视频的互动率等信息。经过一番探索之后,了解到了instaloader库,真的不得不感叹Python的强大。
直接给代码吧:
import instaloader
from urllib.parse import urlparse
import time
import csv
import os
# Instagram post links to process; the entries below are placeholders
# ("示例链接" means "example link") to be replaced with real URLs.
INSTAGRAM_URLS = [
    "示例链接",
    "示例链接",
]
def extract_shortcode(url):
    """
    Extract the post shortcode from an Instagram link.

    The shortcode is taken from the path segment that follows a post-type
    marker (``p``, ``reel``, ``reels`` or ``tv``), so links with trailing
    segments such as ``/p/<code>/embed/`` still resolve to ``<code>``.
    When no marker is present, the last non-empty path segment is used,
    which keeps bare shortcodes (e.g. ``"ABC123"``) working.

    :param url: Instagram post link, or a bare shortcode
    :return: the shortcode, or None if the input has no usable path segment
             or cannot be parsed
    """
    post_markers = ("p", "reel", "reels", "tv")
    try:
        # Dropping empty pieces handles leading/trailing and doubled slashes,
        # and makes an empty path yield an empty list instead of [''].
        segments = [part for part in urlparse(url).path.split('/') if part]
    except (TypeError, ValueError, AttributeError) as e:
        # urlparse raises ValueError on malformed URLs; non-string inputs
        # surface as TypeError/AttributeError.
        print(f"URL: {url} - 短代码提取失败: {e}")
        return None

    # Prefer the segment that directly follows a known post-type marker.
    for marker, candidate in zip(segments, segments[1:]):
        if marker in post_markers:
            return candidate

    # Fallback: last segment (bare shortcode or unrecognised link layout).
    return segments[-1] if segments else None
def fetch_post_data(shortcode, loader):
    """
    Look up a post by its shortcode and collect its headline metrics.

    :param shortcode: shortcode of the post
    :param loader: Instaloader instance whose ``context`` is used for the lookup
    :return: dict with the keys "Username", "Followers", "View Count",
             "Likes", "Comments", "Interaction Rate (%)" and "Post Date",
             or None when an InstaloaderException is raised
    """
    try:
        post = instaloader.Post.from_shortcode(loader.context, shortcode)
        owner_name = post.owner_username
        follower_total = post.owner_profile.followers

        # Only video posts carry a view count; image posts report None.
        views = post.video_view_count if post.is_video else None

        like_total = post.likes
        comment_total = post.comments

        # Interaction rate = (likes + comments) / followers, as a percentage.
        # Left as None when the owner has no followers (avoids dividing by 0).
        rate = None
        if follower_total > 0:
            rate = (like_total + comment_total) / follower_total * 100

        published_at = post.date.strftime("%Y-%m-%d %H:%M:%S")

        record = {
            "Username": owner_name,
            "Followers": follower_total,
            "View Count": views,
            "Likes": like_total,
            "Comments": comment_total,
            "Interaction Rate (%)": None if rate is None else round(rate, 2),
            "Post Date": published_at,
        }
        return record
    except instaloader.exceptions.InstaloaderException as e:
        print(f"短代码: {shortcode} - 错误: {e}")
        return None
# Column order of the output CSV; every row dict carries exactly these keys.
_CSV_FIELDS = (
    "URL",
    "Username",
    "Followers",
    "View Count",
    "Likes",
    "Comments",
    "Interaction Rate (%)",
    "Post Date",
)


def _placeholder_row(url, label):
    """Build the CSV row for a URL whose data could not be collected."""
    row = dict.fromkeys(_CSV_FIELDS, "N/A")
    row["URL"] = url
    row["Username"] = label
    return row


def main():
    """
    Collect metrics for every link in INSTAGRAM_URLS and save them as CSV.

    For each URL the shortcode is extracted and the post data fetched; a
    one-line summary is printed per URL. URLs that yield no shortcode or no
    data get an "N/A" placeholder row. All rows are written to
    ``instagram_posts_data.csv`` (UTF-8) once the loop has finished.
    """
    # Initialise Instaloader.
    loader = instaloader.Instaloader()
    # (Optional) log in to raise the success rate or to reach private posts:
    # uncomment the next line and fill in your Instagram credentials.
    # Note: frequent automated logins may violate Instagram's terms of service.
    # loader.login('your_username', 'your_password')

    # Each row stores its own URL, so the CSV stage does not have to
    # re-associate results with INSTAGRAM_URLS by position.
    rows = []
    for url in INSTAGRAM_URLS:
        shortcode = extract_shortcode(url)
        if not shortcode:
            rows.append(_placeholder_row(url, "Invalid URL or shortcode"))
            print(f"URL: {url} - 无法提取短代码。")
            # No request was sent for this URL, so no throttling is needed.
            continue

        post_data = fetch_post_data(shortcode, loader)
        if post_data:
            row = {"URL": url}
            # Copy only the known columns so DictWriter never sees extras.
            row.update((field, post_data[field]) for field in _CSV_FIELDS[1:])
            rows.append(row)
            print(
                f"URL: {url} - Username: {post_data['Username']}"
                f" - Followers: {post_data['Followers']}"
                f" - Views: {post_data['View Count']}"
                f" - Likes: {post_data['Likes']}"
                f" - Comments: {post_data['Comments']}"
                f" - Interaction Rate: {post_data['Interaction Rate (%)']}%"
                f" - Post Date: {post_data['Post Date']}")
        else:
            rows.append(_placeholder_row(url, "Username not found"))
            print(f"URL: {url} - Username not found.")

        # Pause between requests to reduce the risk of being blocked.
        time.sleep(2)

    # Save the results to a CSV file.
    csv_filename = "instagram_posts_data.csv"
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(_CSV_FIELDS))
        writer.writeheader()
        writer.writerows(rows)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
输出结果如下:
标签:Username,url,shortcode,insloader,取博主,点赞,post,data,Post From: https://blog.csdn.net/dnuiking/article/details/144433030