作者jk808631 (99527)
看板Python
標題[問題] 請教一個程式作業
時間Mon Jun 30 16:27:37 2014
這是期末考的補救機會作業
老師要我完成的是剩下的部分,也就是底下說明之後的部分
import urllib.request
from bs4 import BeautifulSoup
def getText(url, encoding='utf-8'):
    """Download an HTML page and return its text, one non-empty chunk per line.

    Args:
        url: Address of the page to fetch.
        encoding: Encoding hint handed to BeautifulSoup as ``from_encoding``.

    Returns:
        A string built from the page text after ``<script>`` and ``<style>``
        elements are removed: every line is stripped, split on single spaces,
        empty pieces are discarded, and the rest are joined with ``'\\n'``.

    Raises:
        Nothing is caught here; any error raised by
        ``urllib.request.urlopen`` (e.g. ``urllib.error.URLError``) propagates
        to the caller.
    """
    # Example page this helper was written against:
    # http://www.voafanti.com/gate/big5/www.voachinese.com/content/lw1939-pale-in-comparison/1825297.html
    request = urllib.request.Request(url)
    # The context manager closes the HTTP response once parsing is done;
    # BeautifulSoup consumes the file-like object inside its constructor.
    with urllib.request.urlopen(request) as response:
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed on the machine.
        soup = BeautifulSoup(response, 'html.parser', from_encoding=encoding)

    # Kill all script and style elements so their source is not in the text.
    for element in soup(["script", "style"]):
        element.extract()

    text = soup.get_text()
    # Break into lines and remove leading/trailing space on each.
    lines = (line.strip() for line in text.splitlines())
    # NOTE(review): splitting on ONE space puts every space-separated word on
    # its own output line. The accompanying remark ("break multi-headlines
    # into a line each") suggests a double space may have been intended and
    # collapsed when the code was pasted -- confirm before changing.
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    # Drop blank pieces and emit one chunk per line.
    return '\n'.join(chunk for chunk in chunks if chunk)
import re
def getVOA(url):
    """Return the section of a page's text delimited by marker lines.

    The text of ``getText(url)`` is scanned line by line:

    * a line starting with '列印' arms the scan the first time it is seen,
      and switches collecting off if collecting is currently on;
    * once armed, a line starting with '美國之音' switches collecting on;
    * while collecting is on, lines are appended to the result.

    Marker lines themselves are never included, and collected lines are
    concatenated with no separator between them.

    Args:
        url: Address of the page, passed straight to ``getText``.

    Returns:
        The concatenated collected lines (``''`` if nothing was collected).
    """
    collected = []
    armed = False       # a '列印' line has been seen
    collecting = False  # between a '美國之音' line and the next '列印' line
    # NOTE(review): this flag starts out True and is never cleared, so the
    # title check below has no effect on the output. It may have been meant
    # to start as False so output begins at the title line -- confirm.
    started = True

    for line in getText(url).split('\n'):
        if re.match(r'列印', line):
            if collecting:
                collecting = False
            else:
                armed = True
        elif armed and re.match(r'美國之音', line):
            collecting = True
        elif collecting:
            if re.match(r'學個詞-\d+-\w+', line):
                started = True
            if started:
                collected.append(line)

    return ''.join(collected)
# Pages to download. Each literal is kept on a single line: a plain quoted
# string cannot contain a raw line break, so wrapping a URL after its opening
# quote leaves an unterminated string and the file fails to parse.
urls = [
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base64-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTktc3RpY2tlci1zaG9jaw~~/1943689.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTgtZ3JhY2UtcGVyaW9k/1943688.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTctY2l2aWwtd2Fy/1943687.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTYtZGlzcGFyYWdl/1943685.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTUtcHJvaGliaXQ~/1939100.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTQtc3dpdGNo/1939098.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTMtdm9pY2U~/1939094.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTItbWFzY290/1939093.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTEtZXhjaGFuZ2U~/1939092.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTAtYnJlYWR3aW5uZXI~/1935520.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDktYW5vbnltb3Vz/1935516.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDgtZHJhZnQ~/1935513.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDctaWRlbnRpZnk~/1935511.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDYtbmF0aW9ud2lkZQ~~/1935509.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTMzNy1jaGFyaXR5LQ~~/1933985.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDUtY29udHJpYnV0aW9u/1928911.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDQtY29udGFnaW91cw~~/1928909.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDMtYXNzZXNz/1928907.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDItZ3JhZmZpdGk~/1928906.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDEtZnVuZGluZw~~/1928904.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base64-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDAtYWNjb21wbGlzaG1lbnQ~/1925331.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base64-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzktcHVibGljLXRyYW5zaXQ~/1925330.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzgtZGF0YWJhc2U~/1925329.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzctaGVhcmluZw~~/1925327.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzYtcmFudA~~/1925325.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/media/video/1936377.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzUtcHJvZm91bmQ~/1919322.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzQtcGxhbi1i/1919321.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzMtdG94aWM~/1919314.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base64-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzItb24tdGhlLWJyaW5rLW9m/1919312.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzEtY29tcGVs/1919311.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzAtbWF4LW91dA~~/1914530.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjktc2NyZWVuLXRpbWU~/1914527.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjgtdW5leHBlY3RlZA~~/1914522.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjctZGl2ZXJzZQ~~/1914519.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjYtd2lkZS1yYW5naW5n/1914515.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjUtYXQtbm8tY2hhcmdl/1914512.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjQtcmVoYWItY2VudGVy/1914508.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjMtY29tcGxhaW50/1914506.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjItc3VzcGljaW9u/1914504.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjEtb3V0bGF3/1914503.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/media/video/1936263.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjAtbGV0aGFs/1904640.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMTktcG9pc2Vk/1904636.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMTgtbWFyaXRpbWU~/1904632.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base64-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMTctc3VzdGFpbmFiaWxpdHk~/1904630.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base48-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMTYtYmFy/1904626.html',
    'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMTUtc2tlbGV0b24~/1899500.html',
]
#撰寫迴圈將urls中的每一個連結的文字內容個別存入一個文字檔
#文字檔檔名以連結的檔名為檔名, 附檔名則將html改成txt.
#例如urls[0]的檔名為1943689, 故存成的文字檔必須是1943689.txt
#以下示範程式可以顯示 1943689的文字內容, 但你要寫迴圈來批次
#讀取與寫入內容. 完成後請email給我程式檔及所擷取的文字檔.(可以用zip壓縮)
想請教怎麼把文字檔檔名儲存成每個網址後面的數字
迴圈的部分也不太懂到底該怎麼寫....
各位可以救救我嗎
-------------------------------
我這樣寫可以嗎
# Print the file-name stem of every URL (e.g. '1943689' for '.../1943689.html').
for name in urls:
    # Take the last path component and drop its extension instead of slicing
    # fixed offsets, so the stem is still correct if an id is not exactly
    # seven characters long or the extension is not '.html'.
    print(name.rsplit('/', 1)[-1].rsplit('.', 1)[0])
然後要怎麼將每個網址的文字內容都另存一個文字檔
--
※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 118.232.162.101
※ 文章網址: http://www.ptt.cc/bbs/Python/M.1404116894.A.FEC.html
※ 編輯: jk808631 (118.232.162.101), 06/30/2014 16:43:21
→ uranusjr:已經是補救機會還只能問人, 我看還是明年再來對你比較好 06/30 16:43
→ ck574b027:google "python 另存一個文字檔" 很困難嗎? 06/30 21:47
推 yauhh:for name in urls: ... 那樣可以,只要你假定網址都那個規格 06/30 21:52
推 goldflower:你迴圈用open應該就能直接建立新的文件 然後再寫進去 07/01 17:00
→ goldflower:比如f = open('hello.txt','w') 會建立一個叫hello的檔 07/01 17:02
→ goldflower:菇狗一下 "文件讀寫 python" 應該會有很多資料 07/01 17:03