"""Extract absolute and protocol-relative URLs from a saved HTML page.

Scans the page source for quoted ``href``/``src`` attribute values that start
with ``http://``, ``https://`` or ``//`` and prints the unique URLs in sorted
order, followed by a total count.
"""
import re

# Matches the quoted value of an href/src attribute when that value is an
# absolute or protocol-relative URL.  The closing quote must be the same
# character as the opening one, so a URL containing the *other* quote
# character (e.g. an apostrophe inside a double-quoted value) is captured in
# full instead of being cut short, and mismatched quotes are rejected.
url_pattern = re.compile(
    r'\b(?:href|src)\s*=\s*'         # attribute name and "="
    r'(?P<quote>["\'])'              # opening quote, remembered for later
    r'(?P<url>'                      # named capture group for the URL
    r'(?:https?://|//)'              # http://, https:// or //
    r'(?:(?!(?P=quote))[^\s>])+'     # URL body: stop at the opening quote
                                     # character, whitespace or ">"
    r')'
    r'(?P=quote)',                   # closing quote must match the opener
    re.IGNORECASE,
)


def extract_urls(html_content: str) -> list:
    """Return the sorted, de-duplicated URLs found in *html_content*.

    Args:
        html_content: Raw HTML source text to scan.

    Returns:
        A sorted list of unique URL strings taken from ``href``/``src``
        attributes; empty when nothing matches.
    """
    return sorted(
        {match.group('url') for match in url_pattern.finditer(html_content)}
    )


def main(path: str = 'webpage.html') -> None:
    """Read the HTML file at *path* and print the URLs it contains.

    Args:
        path: Location of the saved page source, read as UTF-8.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    unique_urls = extract_urls(html_content)

    print("提取到的网址列表:")
    for i, url in enumerate(unique_urls, 1):
        print(f"{i}. {url}")
    print(f"\n总计发现 {len(unique_urls)} 个网址")


if __name__ == "__main__":
    main()
# 给你写了个 Python 的 demo,直接从网页源码中匹配网址。