euscan: fix #435118 and #435120

Signed-off-by: Corentin Chary <corentin.chary@gmail.com>
author: Corentin Chary <corentin.chary@gmail.com> 2012-09-15 23:20:30 +0200
committer: Corentin Chary <corentin.chary@gmail.com> 2012-09-15 23:21:30 +0200
commit: 9afb31fc65934e387cf93efb2b275946fa254e91 (patch)
tree: 1cda09864bba6747e246a0e1eecbad1272674934 /pym
parent: euscan: update TODO (diff)
download: euscan-9afb31fc65934e387cf93efb2b275946fa254e91.tar.gz
euscan-9afb31fc65934e387cf93efb2b275946fa254e91.tar.bz2
euscan-9afb31fc65934e387cf93efb2b275946fa254e91.zip
3 files changed, 21 insertions, 18 deletions
diff --git a/pym/euscan/__init__.py b/pym/euscan/__init__.py
index 12c4a16..946c63c 100644
--- a/pym/euscan/__init__.py
+++ b/pym/euscan/__init__.py
@@ -77,11 +77,12 @@ ROBOTS_TXT_BLACKLIST_DOMAINS = [
     '(.*)sourceforge(.*)',
     '(.*)github.com',
     '(.*)berlios(.*)',
-    '(.*)qt.nokia.com(.*)',
-    '(.*)chromium.org(.*)',
-    '(.*)nodejs.org(.*)',
-    '(.*)download.mono-project.com(.*)',
-    '(.*)fedorahosted.org(.*)',
+    '(.*)qt\.nokia\.com(.*)',
+    '(.*)chromium\.org(.*)',
+    '(.*)nodejs\.org(.*)',
+    '(.*)download\.mono-project\.com(.*)',
+    '(.*)fedorahosted\.org(.*)',
+    '(.*)download\.tuxfamily\.org(.*)',
 ]
 
 from out import EuscanOutput
diff --git a/pym/euscan/handlers/generic.py b/pym/euscan/handlers/generic.py
index fd82c71..0795488 100644
--- a/pym/euscan/handlers/generic.py
+++ b/pym/euscan/handlers/generic.py
@@ -70,6 +70,7 @@ def scan_html(data, url, pattern):
                  match.group(0))
             )
 
+
     return results
 
 
diff --git a/pym/euscan/helpers.py b/pym/euscan/helpers.py
index 5e3e6ea..3271811 100644
--- a/pym/euscan/helpers.py
+++ b/pym/euscan/helpers.py
@@ -373,27 +373,28 @@ def tryurl(fileurl, template):
 
 def regex_from_template(template):
     # Escape
-    template = re.escape(template)
+    regexp = re.escape(template)
 
     # Unescape specific stuff
-    template = template.replace('\$\{', '${')
-    template = template.replace('\}', '}')
-    template = template.replace('}\.$', '}.$')
+    regexp = regexp.replace('\$\{', '${')
+    regexp = regexp.replace('\}', '}')
+    regexp = regexp.replace('}\.$', '}.$')
 
     # Replace ${\d+}
-    #template = template.replace('${0}', r'([\d]+?)')
-    template = re.sub(r'(\$\{\d+\}(\.?))+', r'([\w\.]+?)', template)
+    #regexp = regexp.replace('${0}', r'([\d]+?)')
+    regexp = re.sub(r'(\$\{\d+\}(\.?))+', r'([\w\.]+?)', regexp)
 
-    #template = re.sub(r'(\$\{\d+\}\.?)+', r'([\w]+?)', template)
-    #template = re.sub(r'(\$\{\d+\}\.+)+', '(.+?)\.', template)
-    #template = re.sub(r'(\$\{\d+\})+', '(.+?)', template)
+    #regexp = re.sub(r'(\$\{\d+\}\.?)+', r'([\w]+?)', regexp)
+    #regexp = re.sub(r'(\$\{\d+\}\.+)+', '(.+?)\.', regexp)
+    #regexp = re.sub(r'(\$\{\d+\})+', '(.+?)', regexp)
 
     # Full version
-    template = template.replace('${PV}', _v)
+    regexp = regexp.replace('${PV}', _v)
 
     # End
-    template = template + r'/?$'
-    return template
+    regexp = regexp + r'/?$'
+
+    return regexp
 
 
 def basedir_from_template(template):
@@ -417,7 +418,7 @@ def generate_scan_paths(url):
     path = prefix + ":/"
     for chunk in chunks:
         if '${' in chunk:
-            steps.append((path, regex_from_template(chunk)))
+            steps.append((path, '^(?:|.*/)'  + regex_from_template(chunk)))
             path = ""
         else:
             path += "/"
author	Corentin Chary <corentin.chary@gmail.com>	2012-09-15 23:20:30 +0200
committer	Corentin Chary <corentin.chary@gmail.com>	2012-09-15 23:21:30 +0200
commit	9afb31fc65934e387cf93efb2b275946fa254e91 (patch)
tree	1cda09864bba6747e246a0e1eecbad1272674934 /pym
parent	euscan: update TODO (diff)
download	euscan-9afb31fc65934e387cf93efb2b275946fa254e91.tar.gz euscan-9afb31fc65934e387cf93efb2b275946fa254e91.tar.bz2 euscan-9afb31fc65934e387cf93efb2b275946fa254e91.zip