summaryrefslogtreecommitdiff
blob: 5f4b280198e6c714bb3d643de4f9f3818b1bd0ab (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
Index: grab/de_tvtoday/tv_grab_de_tvtoday.in
===================================================================
RCS file: /cvsroot/xmltv/xmltv/grab/de_tvtoday/tv_grab_de_tvtoday.in,v
retrieving revision 1.42
retrieving revision 1.45
diff -p -u -r1.42 -r1.45
--- grab/de_tvtoday/tv_grab_de_tvtoday.in	21 Apr 2006 17:17:21 -0000	1.42
+++ grab/de_tvtoday/tv_grab_de_tvtoday.in	25 May 2006 17:10:18 -0000	1.45
@@ -138,6 +138,7 @@ BEGIN {
     else {
         *t = \&Log::TraceMessages::t;
         *d = \&Log::TraceMessages::d;
+	#$Log::TraceMessages::On = 1;
     }
 }
 
@@ -502,9 +503,10 @@ sub parse_page($$) {
     }
 	
     #-- extract date of grabbed data from retrieved webpage ...
-    $_ = $page->look_down('_tag' => 'td', 'class' => 'navigator-hhead-large');
+    $_ = $page->look_down('_tag' => 'span', 'class' => 'text-weiss');
     die("cannot find date on requested page") 
       unless($_->as_text() =~ m/([1-3]?[0-9])\.(1?[0-9])\.(20[0-9]{2})/);
+    t "extracted date: $3-$2-$1";
     $day = ParseDate("$3-$2-$1 00:00:00");
 
     #-- well, now let's scan the table for programme data
@@ -573,6 +575,7 @@ sub parse_page($$) {
 		$show{q(episode-num)} = [ [ $1, "onscreen" ] ];
 	    }
 
+	    t "show title: $span";
 	    $show{title} = [[ $span, $lang ]];
 	} 
 	elsif (ref($span) eq "HTML::Element" and $span->tag eq "a") {
@@ -583,7 +586,7 @@ sub parse_page($$) {
 	    
 	    my $title = ($tag->content_list())[0];
 
-	    $title = convert_cp1252_chars(\$title);
+	    convert_cp1252_chars(\$title);
 
 	    $title =~ s/\s*\([^\(]+\)\s*$//;
 	    if ($title =~ s/\s*(\d+)\.\sTeil//gi) {
@@ -836,6 +839,7 @@ sub squeeze_out_desc($$) {
     # try to match <category>, <country> <year>; R: <names>; D: <names> construct
     # where <country>/<year> or the [RD]: stuff may be missing ...
     if(my @parts = ($$desc =~ m/^\s*(\(([^\)]*)\))?\s+([^,;0-9]+)(,?\s+([^,;]+)\s+([12][09][0-9]{2}(?:[\/-][0-9]{2})?))?\s*; (?:(?:; )?(Buch\/Regie|R): ([^;]+))?\s*((?:; )?D: (.+))?\s*$/)) {
+        t "split rule: <category>, <country> <year> ...";
 	$$desc = "";
 
 	#-- $parts[1] is the show title in English (doesn't have to be available)
@@ -887,9 +891,14 @@ sub squeeze_out_desc($$) {
 	}
     } 
     else {
+        t "split rule: dot splitting";
 	my @data = split "�", $$desc;
 	s/(^\s|\s$)//g foreach(@data); #CHG#
 
+	for(0 .. (scalar(@data) - 1)) {
+	    t "dot-split part $_: " . $data[$_];
+	}
+
 	if(scalar(@data) == 3 
 	   && not($data[1] =~ m/[\w�������]+:/) #- FIX false positive: tvtoday.de seems to publish "guests: <names>" here some (rare) times :-(  
 	   && $data[2] =~ m/^Mit (.*?)$/) {
@@ -945,15 +954,22 @@ sub squeeze_out_desc($$) {
 		    next;
 		}
 
-		if (my ($cat, $rest1, $names, $guests, $rest2) = m/^([^,]+?)((?:\s+-\s+..+?)*) - Moderation: (.+?) - G�ste: (..+?)(?:\s+-\s+(.+))?$/) {
+		if (my ($nocat, $cat, $rest1, $names, $guests, $rest2) = m/^(([^,.%^&*();]+?)((?:\s+-\s+..+?)*)|.+) - Moderation: (.+?) - G�ste: (..+?)(?:\s+-\s+(.+))?$/) {
 		    my @data = split_up_names($names, $show);
 		    push @{$show->{"credits"}{"presenter"}}, @data;
 		    my @guest_data = split_up_names($guests, $show);
 		    push @{$show->{"credits"}{"guest"}}, @guest_data;
-		    $show->{"category"} = [[ $cat, $lang ]];
-		    
-	            warn "misdetected category: $cat" 
-		      if($cat =~ m/\d{4}/);
+
+		    if(defined($cat)) {
+			$show->{"category"} = [[ $cat, $lang ]];
+			
+			warn "misdetected category: $cat" 
+			  if($cat =~ m/\d{4}/);
+		    }
+		    else {
+		        t "no-cat match: $nocat";
+			$rest1 = $nocat;
+		    }
 		      
 		    my @rest; 
 		    foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) {
@@ -963,14 +979,21 @@ sub squeeze_out_desc($$) {
 		    next unless length($_);
 		}
 
-		if (my ($cat, $rest1, $names, $rest2) = m/^([^,]+?)((?:\s+-\s+..+?)*) - Moderation: (.+?)(?:\s+-\s+(.+))?$/) {
+		if (my ($nocat, $cat, $rest1, $names, $rest2) = m/^(([^,]+?)((?:\s+-\s+..+?)*)|.+) - Moderation: (.+?)(?:\s+-\s+(.+))?$/) {
 		    my @data = split_up_names($names, $show);
 		    push @{$show->{"credits"}{"presenter"}}, @data;
-		    $show->{"category"} = [[ $cat, $lang ]];
 
-	            warn "misdetected category: $cat" 
-		      if($cat =~ m/\d{4}/);
-		    
+		    if(defined($cat)) {
+			$show->{"category"} = [[ $cat, $lang ]];
+
+			warn "misdetected category: $cat" 
+			  if($cat =~ m/\d{4}/);
+		    }
+		    else {
+		        t "no-cat match: $nocat";
+			$rest1 = $nocat;
+		    }
+			
 		    my @rest; 
 		    foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) {
 			push @rest, $_ if(defined($_) && length($_));