Initial commit
This commit is contained in:
140
skills/plan-video/scripts/extract_video_id.py
Normal file
140
skills/plan-video/scripts/extract_video_id.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract VideoId from various video platform URLs.
|
||||
|
||||
Supports: 小红书, 抖音, TikTok, B站, YouTube, 快手等
|
||||
"""
|
||||
|
||||
import re
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
|
||||
def extract_video_id(url: str) -> str:
    """
    Extract VideoId from video platform URL.

    The platform is detected with a plain substring test on the whole URL,
    and the branches are tried in a fixed order. If a platform branch is
    entered but none of its patterns match, processing continues with the
    following branches and finally with a generic fallback that returns the
    last non-empty path segment.

    Args:
        url: Video URL from supported platforms. Surrounding whitespace is
            stripped before matching.

    Returns:
        VideoId string. For Bilibili "av" links the ``av`` prefix is kept.

    Raises:
        ValueError: If URL format is not recognized (no pattern matched and
            the URL has no non-empty path segment).
    """
    url = url.strip()

    # Xiaohongshu short link: http://xhslink.com/o/6VbNVltFQRX
    if 'xhslink.com' in url or 'xiaohongshu.com' in url:
        match = re.search(r'/o/([A-Za-z0-9]+)', url)
        if match:
            return match.group(1)

    # Douyin: https://v.douyin.com/xxx/ or https://www.douyin.com/video/xxx
    if 'douyin.com' in url:
        # Short link
        match = re.search(r'v\.douyin\.com/([A-Za-z0-9]+)', url)
        if match:
            return match.group(1)
        # Long link
        match = re.search(r'/video/(\d+)', url)
        if match:
            return match.group(1)

    # TikTok: https://www.tiktok.com/@user/video/1234567890
    if 'tiktok.com' in url:
        match = re.search(r'/video/(\d+)', url)
        if match:
            return match.group(1)
        # Short link: https://vm.tiktok.com/xxx/
        match = re.search(r'vm\.tiktok\.com/([A-Za-z0-9]+)', url)
        if match:
            return match.group(1)

    # Bilibili: https://www.bilibili.com/video/BVxxx or https://b23.tv/xxx
    if 'bilibili.com' in url or 'b23.tv' in url:
        # BV id
        match = re.search(r'/(BV[A-Za-z0-9]+)', url)
        if match:
            return match.group(1)
        # av id (the "av" prefix is re-attached to the numeric part)
        match = re.search(r'/av(\d+)', url)
        if match:
            return f"av{match.group(1)}"
        # Short link
        match = re.search(r'b23\.tv/([A-Za-z0-9]+)', url)
        if match:
            return match.group(1)

    # YouTube: https://www.youtube.com/watch?v=xxx or https://youtu.be/xxx
    if 'youtube.com' in url or 'youtu.be' in url:
        # youtu.be short link
        match = re.search(r'youtu\.be/([A-Za-z0-9_-]+)', url)
        if match:
            return match.group(1)
        # Standard link: the id is the "v" query parameter
        parsed = urlparse(url)
        if parsed.query:
            params = parse_qs(parsed.query)
            if 'v' in params:
                return params['v'][0]

    # Kuaishou: https://www.kuaishou.com/short-video/xxx
    if 'kuaishou.com' in url:
        # Accept the whole alphanumeric segment, not just leading digits:
        # a digits-only pattern truncates ids such as "3xabc" to "3", and
        # ids starting with a letter would fall through to the pattern
        # below and come back as the literal path segment "short".
        match = re.search(r'/short-video/([A-Za-z0-9_-]+)', url)
        if match:
            return match.group(1)
        # First path segment directly after ".com/" (presumably meant for
        # short links of the form <host>.com/<id> -- TODO confirm).
        match = re.search(r'\.com/([A-Za-z0-9]+)', url)
        if match:
            return match.group(1)

    # Nothing matched: fall back to the last non-empty path segment.
    parsed = urlparse(url)
    path_parts = [p for p in parsed.path.split('/') if p]
    if path_parts:
        return path_parts[-1]

    raise ValueError(f"无法从 URL 提取 VideoId: {url}")
|
||||
|
||||
|
||||
def extract_urls(text: str) -> list[str]:
    """
    Extract video URLs from text.

    A URL is any whitespace-delimited run starting with ``http://`` or
    ``https://`` that contains one of the known platform keywords
    (douyin, tiktok, youtube, youtu.be, xiaohongshu, xhslink, bilibili,
    b23.tv, kuaishou) anywhere after the scheme.

    Args:
        text: Text containing video URLs

    Returns:
        List of unique video URLs, in order of first appearance
    """
    # `[^\s]*` (zero or more) before the keyword group is deliberate: with
    # `+` at least one character had to sit between "://" and the keyword,
    # so bare-host links such as http://xhslink.com/..., https://youtu.be/...
    # and https://b23.tv/... were silently dropped.
    pattern = r'https?://[^\s]*(?:douyin|tiktok|youtube|youtu\.be|xiaohongshu|xhslink|bilibili|b23\.tv|kuaishou)[^\s]*'
    urls = re.findall(pattern, text)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(urls))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: run each sample URL through extract_video_id and
    # print either the extracted id or the error it raised.
    sample_links = (
        'http://xhslink.com/o/6VbNVltFQRX',
        'http://xhslink.com/o/16X9vM9CTBo',
        'https://v.douyin.com/eFyQRjc/',
        'https://www.douyin.com/video/7123456789',
        'https://www.tiktok.com/@user/video/7123456789',
        'https://vm.tiktok.com/ZMeAbCdEf/',
        'https://www.bilibili.com/video/BV1xx411c7mD',
        'https://b23.tv/av12345678',
        'https://www.youtube.com/watch?v=dQw4w9WgXcQ',
        'https://youtu.be/dQw4w9WgXcQ',
        'https://www.kuaishou.com/short-video/1234567890',
    )

    print("VideoId 提取测试:\n")
    for url in sample_links:
        try:
            video_id = extract_video_id(url)
        except ValueError as e:
            # Unrecognized URL: report the failure and keep going.
            print(f"❌ {url}")
            print(f" → {e}\n")
        else:
            print(f"✅ {url}")
            print(f" → {video_id}\n")
|
||||
Reference in New Issue
Block a user