aboutsummaryrefslogtreecommitdiff
path: root/pym
diff options
context:
space:
mode:
author: Corentin Chary <corentin.chary@gmail.com> 2012-09-15 23:20:30 +0200
committer: Corentin Chary <corentin.chary@gmail.com> 2012-09-15 23:21:30 +0200
commit: 9afb31fc65934e387cf93efb2b275946fa254e91 (patch)
tree: 1cda09864bba6747e246a0e1eecbad1272674934 /pym
parent: euscan: update TODO (diff)
download: euscan-9afb31fc65934e387cf93efb2b275946fa254e91.tar.gz
euscan-9afb31fc65934e387cf93efb2b275946fa254e91.tar.bz2
euscan-9afb31fc65934e387cf93efb2b275946fa254e91.zip
euscan: fix #435118 and #435120
Signed-off-by: Corentin Chary <corentin.chary@gmail.com>
Diffstat (limited to 'pym')
-rw-r--r-- pym/euscan/__init__.py 11
-rw-r--r-- pym/euscan/handlers/generic.py 1
-rw-r--r-- pym/euscan/helpers.py 27
3 files changed, 21 insertions, 18 deletions
diff --git a/pym/euscan/__init__.py b/pym/euscan/__init__.py
index 12c4a16..946c63c 100644
--- a/pym/euscan/__init__.py
+++ b/pym/euscan/__init__.py
@@ -77,11 +77,12 @@ ROBOTS_TXT_BLACKLIST_DOMAINS = [
'(.*)sourceforge(.*)',
'(.*)github.com',
'(.*)berlios(.*)',
- '(.*)qt.nokia.com(.*)',
- '(.*)chromium.org(.*)',
- '(.*)nodejs.org(.*)',
- '(.*)download.mono-project.com(.*)',
- '(.*)fedorahosted.org(.*)',
+ '(.*)qt\.nokia\.com(.*)',
+ '(.*)chromium\.org(.*)',
+ '(.*)nodejs\.org(.*)',
+ '(.*)download\.mono-project\.com(.*)',
+ '(.*)fedorahosted\.org(.*)',
+ '(.*)download\.tuxfamily\.org(.*)',
]
from out import EuscanOutput
diff --git a/pym/euscan/handlers/generic.py b/pym/euscan/handlers/generic.py
index fd82c71..0795488 100644
--- a/pym/euscan/handlers/generic.py
+++ b/pym/euscan/handlers/generic.py
@@ -70,6 +70,7 @@ def scan_html(data, url, pattern):
match.group(0))
)
+
return results
diff --git a/pym/euscan/helpers.py b/pym/euscan/helpers.py
index 5e3e6ea..3271811 100644
--- a/pym/euscan/helpers.py
+++ b/pym/euscan/helpers.py
@@ -373,27 +373,28 @@ def tryurl(fileurl, template):
def regex_from_template(template):
# Escape
- template = re.escape(template)
+ regexp = re.escape(template)
# Unescape specific stuff
- template = template.replace('\$\{', '${')
- template = template.replace('\}', '}')
- template = template.replace('}\.$', '}.$')
+ regexp = regexp.replace('\$\{', '${')
+ regexp = regexp.replace('\}', '}')
+ regexp = regexp.replace('}\.$', '}.$')
# Replace ${\d+}
- #template = template.replace('${0}', r'([\d]+?)')
- template = re.sub(r'(\$\{\d+\}(\.?))+', r'([\w\.]+?)', template)
+ #regexp = regexp.replace('${0}', r'([\d]+?)')
+ regexp = re.sub(r'(\$\{\d+\}(\.?))+', r'([\w\.]+?)', regexp)
- #template = re.sub(r'(\$\{\d+\}\.?)+', r'([\w]+?)', template)
- #template = re.sub(r'(\$\{\d+\}\.+)+', '(.+?)\.', template)
- #template = re.sub(r'(\$\{\d+\})+', '(.+?)', template)
+ #regexp = re.sub(r'(\$\{\d+\}\.?)+', r'([\w]+?)', regexp)
+ #regexp = re.sub(r'(\$\{\d+\}\.+)+', '(.+?)\.', regexp)
+ #regexp = re.sub(r'(\$\{\d+\})+', '(.+?)', regexp)
# Full version
- template = template.replace('${PV}', _v)
+ regexp = regexp.replace('${PV}', _v)
# End
- template = template + r'/?$'
- return template
+ regexp = regexp + r'/?$'
+
+ return regexp
def basedir_from_template(template):
@@ -417,7 +418,7 @@ def generate_scan_paths(url):
path = prefix + ":/"
for chunk in chunks:
if '${' in chunk:
- steps.append((path, regex_from_template(chunk)))
+ steps.append((path, '^(?:|.*/)' + regex_from_template(chunk)))
path = ""
else:
path += "/"