# 本帖最后由 xiaoxue 于 2019-3-4 20:51 编辑
# -*- coding: UTF-8 -*-
import glob
import os
import random
import subprocess
import sys
import time
from HTMLParser import HTMLParser
from urllib import quote

import requests
from tqdm import tqdm

reload(sys)
sys.setdefaultencoding('utf8')

class WebRequest(object):
    """Small helper around ``requests.get`` that adds browser-like headers
    and a bounded retry loop."""

    def __init__(self, *args, **kwargs):
        pass

    @property
    def user_agent(self):
        """
        Return a User-Agent string picked at random from a fixed list.
        :return: one of the hard-coded browser User-Agent strings
        """
        ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        ]
        return random.choice(ua_list)

    @property
    def header(self):
        """
        Build the basic request headers.
        :return: a new dict on every access, with a random User-Agent
        """
        return {'User-Agent': self.user_agent,
                'Accept': '*/*',
                'Connection': 'keep-alive',
                'Accept-Language': 'zh-CN,zh;q=0.8'}

    def get(self, url, header=None, retry_time=5, timeout=30,
            retry_flag=(), retry_interval=5, *args, **kwargs):
        """
        Fetch ``url`` with GET, retrying on failure.

        A response counts as failed when the request raises, when its body
        is empty, or when any item of ``retry_flag`` occurs in the body.

        :param url: target url
        :param header: extra headers (dict) merged over the basic header
        :param retry_time: maximum number of attempts (at least one is made)
        :param timeout: network timeout in seconds, passed to requests
        :param retry_flag: iterable of markers; if one is found in the
            response body the request is retried
        :param retry_interval: seconds to sleep between attempts
        :param args: accepted for compatibility; not forwarded
        :param kwargs: accepted for compatibility; not forwarded
        :return: the ``requests`` response object of the first good attempt
        :raises Exception: the error of the last attempt once ``retry_time``
            attempts have all failed
        """
        headers = self.header
        if header and isinstance(header, dict):
            headers.update(header)
        attempts_left = max(int(retry_time), 1)
        while True:
            try:
                html = requests.get(url, headers=headers, timeout=timeout, stream=True)
                # Reading .content pulls the whole body even though stream=True.
                body = html.content
                print('content size: %d' % len(body))
                if any(f in body for f in retry_flag):
                    raise ValueError('retry flag found in response: ' + url)
                if not body:
                    print('content is Null,retry...~' + url)
                    raise ValueError('empty response body: ' + url)
                return html
            except Exception as e:
                print(e)
                attempts_left -= 1
                if attempts_left <= 0:
                    # Give up instead of spinning forever on a dead URL.
                    raise
                time.sleep(retry_interval)


def DownloadAudio(url, path, refer=None):
    """Download ``url`` through ``WebRequest`` and save the body to ``path``.

    :param url: address of the resource to fetch
    :param path: destination file; a leading ``~`` is expanded to the
        user's home directory
    :param refer: optional value for the ``Referer`` header; when given,
        only ``User-Agent`` and ``Referer`` are sent as extra headers
    """
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
    if refer:
        hdr = {
            'User-Agent': user_agent,
            'Referer': refer}
    else:
        hdr = {
            'User-Agent': user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'}
    # Fetch before opening the file so a failed request does not leave an
    # empty file behind.
    wr = WebRequest()
    response = wr.get(url, header=hdr)
    with open(os.path.expanduser(path), 'wb') as handle:
        # An explicit chunk size avoids iter_content()'s 1-byte default.
        for block in tqdm(response.iter_content(chunk_size=8192), ascii=True, desc='ImageDownload'):
            if not block:
                break
            handle.write(block)

def translate(to_translate, to_langage="auto", langage="auto"):
    '''Return the translation using google translate
    you must shortcut the langage you define (French = fr, English = en, Spanish = es, etc...)
    if you don't define anything it will detect it or use english by default
    Example:
    print(translate("salut tu vas bien?", "en"))
    hello you alright?

    The result is cut out of the raw response bytes: everything between the
    marker ``class="t0">`` and the next ``<``.  An empty byte string is
    returned when the marker is not present in the page.
    '''
    agents = {'User-Agent': "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)"}
    before_trans = b'class="t0">'
    # Let requests build the query string so '&', '#', '+' and non-ASCII
    # characters in the text are escaped (spaces still become '+').
    params = {'ie': 'UTF-8', 'hl': to_langage, 'sl': langage, 'q': to_translate}
    request = requests.get("http://translate.google.com/m", params=params,
                           headers=agents, timeout=30)
    page = request.content
    start = page.find(before_trans)
    if start == -1:
        # Marker missing: layout changed or the request was rejected.
        return b''
    result = page[start + len(before_trans):]
    result = result.split(b"<")[0]
    return result

# Module-level list; nothing in the visible code ever appends to it.
tsxt_sen = []
from nltk.tokenize import sent_tokenize
def sentence_split(sentence):
    """Split ``sentence`` on commas.

    :param sentence: text to split
    :return: list of the comma-separated pieces, in order; whitespace
        around each piece is left untouched
    """
    # Return the pieces themselves, not an unrelated empty module-level list.
    return sentence.split(',')


# Sample article that the rest of the script turns into speech.
# NOTE: the name shadows the builtin ``str``; it is kept because later
# statements in this script read it under this name.
str = '''
It has not quite been greeted with the enthusiasm of the bottles of Scotch in the novel Whisky Galore, but the arrival of a cargo of pineapples on the shores of northern Scotland has sent scavengers rushing to the beach.
The fruit, which has appeared on Shetland and on beaches across the Western Isles, is thought to have come from several containers lost overboard in the Atlantic last October by cargo ship MV Lombok Strait.

Shetland islanders have now shared photos of their fruitful haul, in a story that echoes the plot of Whisky Galore, the 1947 novel by Sir Compton Mackenzie, later made and remade for the big screen.

The story tells of how the locals find their island awash with whisky after a ship loaded with liquor is wrecked off a fictional Scottish island.

The plot was based on the real-life salvage operations carried out by locals when the SS Politician, carrying 280,000 bottles of malt whisky, ran aground on Eriskay in 1943.

The arrival of the pineapples will be far less lucrative for beachcombers, and not as profitable, either, as the grounding of the cargo ship MSC Napoli off Branscombe beach in Devon in 2007.

Then scavengers ignored police advice and arrived in their droves to explore shipping containers, which contained motorbikes, pet food, wine barrels and anti-wrinkle cream.
'''

# Single-line copy of the article (newlines become spaces), saved to the
# home directory.  open() does not expand '~', so expand it explicitly.
text = str.strip().replace('\n', ' ')
with open(os.path.expanduser('~/text.txt'), 'w') as handle:
    handle.write(text)

# Normalise curly double quotes to plain ASCII quotes.
str = str.replace('”', '"')
str = str.replace('“', '"')

def splitStr(str):
    """Cut ``str`` into two halves at a word boundary.

    The text is split on single spaces; the first half gets the first
    ``len(words) // 2`` words and the second half gets the rest.

    :param str: text to split (the parameter name shadows the builtin and
        is kept for keyword-argument compatibility)
    :return: two-item list ``[first_half, second_half]``
    """
    arr = str.split(' ')
    # Floor division keeps the slice index an int on Python 3 as well.
    half = len(arr) // 2
    return [' '.join(arr[:half]), ' '.join(arr[half:])]


#print splitStr('with wording from North Korea about getting rid of its nuclear weapons and a guarantee from the United States that it would not interfere with the North’s regime or demand redress for human rights abuses.')

# Break the article into chunks of at most 190 characters each
# (presumably a length limit of the speech endpoint -- TODO confirm).
# Sentences that are too long are split on commas, and comma pieces that
# are still too long are halved with splitStr().
keyArr = []
sent_tokenize_list = sent_tokenize(str)
for s in sent_tokenize_list:
    sentence = ' '.join(s.split())  # collapse runs of whitespace
    if not sentence:
        continue
    if len(sentence) <= 190:
        keyArr.append(sentence)
        continue
    for ses in s.split(','):
        piece = ' '.join(ses.split())
        if not piece:
            # Skip empty pieces (e.g. from ",,"); they would become empty
            # queries downstream.
            continue
        if len(piece) > 190:
            keyArr.extend(splitStr(piece))
        else:
            keyArr.append(piece)


API_URL = "http://translate.google.com/translate_tts?ie=UTF-8&tl=en-us"
# Alternative chunking that was tried: keyArr = sentence_split(str)

# Output directory for the numbered mp3 files.  '~' is expanded here
# because open() does not do it.
AUDIO_DIR = os.path.expanduser('~/tmp/audio')
if not os.path.isdir(AUDIO_DIR):
    os.makedirs(AUDIO_DIR)
# Remove the mp3 files left over from a previous run.
for stale in glob.glob(os.path.join(AUDIO_DIR, '*.mp3')):
    os.remove(stale)

namei = 0
for i in keyArr:
    juzi = ' '.join(i.split())
    if not juzi:
        # Nothing to speak; skip without consuming a file number.
        continue
    # Percent-encode the sentence so characters such as '&' or '#' stay
    # inside the q parameter.
    link = API_URL + "&q=" + quote(juzi) + "&client=tw-ob"
    print(link)
    path = os.path.join(AUDIO_DIR, "%d.mp3" % namei)
    DownloadAudio(link, path)
    try:
        # Print the detected file type of the download.
        subprocess.call(['file', path])
    except OSError as e:
        # The `file` utility is optional; do not abort the run over it.
        print(e)
    time.sleep(1)  # pause between requests
    namei = namei + 1

# 复制代码
# 乱七八糟拼一拼 就是个小玩意~~~~~