Simple Python Crawler Code for Beginners: Web-Scraping Basics


Data Processing

JSON processing

Python's built-in json module converts between JSON strings and Python objects: json.loads parses a string, json.dumps serializes back.

import json

j = "[{\"name\":\"张三\",\"sex\":\"男\"},{\"name\":\"张三\",\"sex\":\"男\"},{\"name\":\"张三\",\"sex\":\"男\"},{\"name\":\"张三\",\"sex\":\"男\"}]"
obj = json.loads(j)  # parse the JSON string into a Python list of dicts
s = json.dumps(obj, ensure_ascii=False)  # serialize back to JSON; keep non-ASCII characters readable
print(s)
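
Since the examples below write scraped results to disk, note that json can also read and write file handles directly via json.dump and json.load. A minimal sketch, assuming a throwaway file name data.json:

import json

records = [{"name": "张三", "sex": "男"}]

# json.dump serializes straight into a file handle; json.load parses one back
with open("data.json", mode="w", encoding="utf-8") as f:
    json.dump(records, f, ensure_ascii=False)

with open("data.json", mode="r", encoding="utf-8") as f:
    print(json.load(f))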


For example, query the Toutiao homepage focus API and save each title and link to a file:

import requests
import json

url = "https://www.toutiao.com"
url_json = "https://www.toutiao.com/api/pc/focus/"

r = requests.get(url_json)
r.encoding = "utf-8"
# the response body contains \uXXXX escape sequences; decode them into readable text
text = r.text.encode("utf-8").decode("unicode-escape")
obj = json.loads(text)

items = []  # avoid shadowing the built-in name "list"
for item in obj["data"]["pc_feed_focus"]:
    dic = {}
    dic["name"] = item["title"]
    dic["url"] = url + item["display_url"]  # article links are relative to the site root
    items.append(dic)
print(items)

# write one "title + link" block per news item; the with-statement closes the file
with open("d:/text.txt", mode="w+", encoding="utf-8") as f:
    for item in items:
        text = "%s\n%s\n\n" % (item["name"], item["url"])
        f.writelines(text)
print("Saved successfully")

Regular Expressions

  • Commonly used regex methods (a short demo follows the snippet below):
  • re.compile (compile a pattern for reuse)
  • pattern.match (match only at the start of the string)
  • pattern.search (find the first match anywhere in the string)
  • pattern.findall (find all non-overlapping matches)
  • pattern.sub (replace matches)
#get the href and the inner text of every <a> tag on a page
#(url and header are assumed to be defined as in the surrounding examples)
import re
import requests

response = requests.get(url, headers=header, verify=False)
response.encoding = "gbk"  # the target page in this example is GBK-encoded
text = response.text
links = re.findall("<a.*?href=\"(.*?)\".*?>(.*?)</a>", text)  # list of (href, text) tuples
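
A minimal demonstration of the other methods from the list above, run on a made-up string:

import re

s = "name: 张三, sex: 男"
pattern = re.compile(r"name: (\w+)")  # compile once, reuse many times

print(pattern.match(s).group(1))    # matches only at the start of s -> 张三
print(pattern.search(s).group(1))   # first match anywhere in s -> 张三
print(pattern.findall(s))           # all captured groups -> ['张三']
print(pattern.sub("name: ***", s))  # replace every match -> name: ***, sex: 男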

Let's try scraping the news feed

import requests
import json

url = "https://www.toutiao.com"
url_json = "https://www.toutiao.com/api/pc/feed/"

def get_new(base_url, page):
    print("Fetching page %d\n" % page)
    # query string captured from the browser; "as", "cp" and "_signature"
    # are anti-crawler tokens and expire quickly
    par = {"min_behot_time": "0",
           "category": "__all__",
           "utm_source": "toutiao",
           "widen": page,
           "tadrequire": "true",
           "as": "A1250B9CAD1D50B",
           "cp": "5BCD3D15503B1E1",
           "_signature": "RKT9sBAZH2s.nZ52VHIiUkSk.a"}

    # request headers (including a logged-in session cookie) copied from the browser
    header = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, sdch, br",
        "accept-language": "zh-CN,zh;q=0.8",
        "cookie": "uuid=\"w:dcbd0242a9fe484383c6636fe20d060f\"; _ga=GA1.2.1506896000.1534144404; __tea_sdk__ssid=8e3979d0-0146-4a09-981e-4bf22ddcd8da; __tea_sdk__user_unique_id=6614851142792824324; ccid=0889c1662d7c32cf7cfbbaed374fff41; UM_distinctid=16699804cee25c-033cb3b5725eef-2b6f686a-1fa400-16699804cef351; login_flag=5a14c20d209ab052a7978c7086705592; sid_tt=03441da17a2258d893fe14bdc8f89104; sessionid=03441da17a2258d893fe14bdc8f89104; part=stable; sid_guard=\"03441da17a2258d893fe14bdc8f89104|1540173659|2592000|Wed\054 21-Nov-2018 02:00:59 GMT\"; uid_tt=91c220e0eedcd3bbf0f18bf313ce1991; odin_tt=46e7201ef0566449f12c3109272fb067fa703d42c8dc1206cbf5c0230dd35a0ab1f3cf871381c186e229dfd246e5bdc2; tt_webid=6614851142792824324; WEATHER_CITY=%E5%8C%97%E4%BA%AC; CNZZDATA1259612802=1078251197-1540189911-%7C1540211511; __tasessionId=2b1oj444a1540216055134; csrftoken=e26f54e05d0eaaf3697b65190c17d652; tt_webid=6614851142792824324",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"}

    r = requests.get(url_json, params=par, headers=header)
    r.encoding = "utf-8"
    # decode the \uXXXX escapes in the response body
    text = r.text.encode("utf-8").decode("unicode-escape")
    print(text)
    obj = json.loads(text)

    items = []
    if len(obj["data"]) > 0:
        for item in obj["data"]:
            dic = {}
            dic["name"] = item["title"]
            dic["url"] = base_url + item["source_url"]  # article links are relative to the site root
            items.append(dic)

        # append this page's results, then recurse into the next page
        with open("d:/text.txt", mode="a+", encoding="utf-8") as f:
            for item in items:
                text = "%s\n%s\n\n" % (item["name"], item["url"])
                f.writelines(text)
        get_new(base_url, page + 1)
    else:
        print("Saved successfully")

get_new(url, 1)  # pass the site root, not the API URL, so the saved links resolve
print("Done")
