19 Commits

Author SHA1 Message Date
2806590e7f getItemsInSection 2026-04-30 15:29:22 -04:00
954dec3603 getItemsInFeed 2026-04-30 15:10:44 -04:00
9cb9c01471 speed up getItemsInSection 2026-04-30 15:08:01 -04:00
02bdeba314 improve removeHtml 2026-04-30 15:00:12 -04:00
af5a989c21 getItemsInSection 2026-04-30 14:46:43 -04:00
b860c2d0ef Fix getItemsInSection 2026-04-30 14:39:48 -04:00
09e3980d2c Rewrite getItemsInSection 2026-04-30 14:21:37 -04:00
1cdedee244 Logging 2026-04-30 14:05:54 -04:00
b55a299a3c Performance timer. 2026-04-30 13:05:05 -04:00
2fc96a3cc4 reduce feed log 2026-04-30 11:44:22 -04:00
7bb844d9ee Fix BetweenString 2026-04-30 11:39:07 -04:00
0ec1eaef39 Add logging 2026-04-30 09:56:03 -04:00
6e521c382c Fix typo in comments 2026-04-29 21:36:33 -04:00
cb5a1bfbbe Mege NF_0004 2026-04-27 12:47:13 -04:00
98c37f2204 Add try except in Video.fromString for invalid date 2026-04-22 12:43:53 -04:00
4cb76dfb58 Fix handling of <time> tag and replace html tags 2026-04-18 10:21:36 -04:00
e660e385e5 Reviewed with Anthropic Claude and made bug fixes. 2026-02-22 11:45:01 -05:00
13b18b01dd Fix date arithmetic in isFeedCacheAvailable 2026-01-29 23:27:58 -05:00
91383d8687 Fix strptime issue 2026-01-29 23:02:55 -05:00
5 changed files with 572 additions and 296 deletions

View File

@@ -6,8 +6,8 @@ from utility import *
from video import *
# This file is executed in a cron job.
# To view the cron schedule type sudo crontab -r in a shell. Use Ctrl-S to save after editing
# This cron job should run every 30 minutes. Shorter intervals burden the system
# To view the cron schedule type sudo crontab -l in a shell. sudo crontab -e for editing. Use Ctrl-S to save after editing
# This cron job should run every 10 minutes. Shorter intervals burden the system
# The output from the print statements goes to the syslog (/var/log/syslog); view it with: sudo nano /var/log/syslog
# Overall system performance can be monitored using htop

View File

@@ -28,7 +28,7 @@ CACHE_EXPIRY_MINS=10
LOG_HTTP_RESPONSES = False
FEED_REJECT_IF_OLDER_THAN_DAYS = 7
FEED_REJECT_IF_OLDER_THAN_DAYS = 60
class PathHelper:
pathChar="/"

View File

@@ -30,94 +30,126 @@ class NewsFeed:
return False
def getItemsInAmericasNewsRoomFeed(self,url):
now=datetime.now()
cachePathFileName=PathHelper.makePathFileName(VIDEODB_AMERICAS_NEWSROOM_FILENAME,self.pathDb)
if self.isFeedCacheAvailable(cachePathFileName,CACHE_EXPIRY_MINS):
videos=self.readFeedCache(cachePathFileName)
if videos is not None:
return(videos)
sections=Sections()
videos = {}
httpNetRequest=HttpNetRequest()
response=httpNetRequest=httpNetRequest.getHttpNetRequest(url)
status=response.status_code
searchIndex=0
response.close()
if status!=200:
return None
if LOG_HTTP_RESPONSES:
self.writeLog(url)
self.writeLog(response.text)
while -1!= searchIndex:
video, searchIndex = sections.getItemsInSection(response.text,"article",searchIndex)
if video is not None and not (video.description in videos):
videos[video.description]=video
video.setFeedTime(DateTimeHelper.applyRelativeTime(now,video.feedTimeOffset))
videoList=list(videos.values())
videoList=sorted(videoList, key=lambda x:x.getFeedTime(),reverse=False)
self.writeFeedCache(cachePathFileName,videoList)
return (videoList)
response = None
try:
now=datetime.now()
cachePathFileName=PathHelper.makePathFileName(VIDEODB_AMERICAS_NEWSROOM_FILENAME,self.pathDb)
if self.isFeedCacheAvailable(cachePathFileName,CACHE_EXPIRY_MINS):
videos=self.readFeedCache(cachePathFileName)
if videos is not None:
return(videos)
sections=Sections()
videos = {}
httpNetRequest=HttpNetRequest()
response=httpNetRequest.getHttpNetRequest(url)
status=response.status_code
searchIndex=0
if status!=200:
return None
if LOG_HTTP_RESPONSES:
self.writeLog(url)
self.writeLog(response.text)
while -1!= searchIndex:
video, searchIndex = sections.getItemsInSection(response.text,"article",searchIndex)
if video is not None and not (video.description in videos):
videos[video.description]=video
video.setFeedTime(DateTimeHelper.applyRelativeTime(now,video.feedTimeOffset))
videoList=list(videos.values())
videoList=sorted(videoList, key=lambda x:x.getFeedTime(),reverse=False)
self.writeFeedCache(cachePathFileName,videoList)
return (videoList)
finally:
if None!= response:
response.close()
def getItemsInOutnumberedFeed(self,url):
now=datetime.now()
cachePathFileName=PathHelper.makePathFileName(VIDEODB_OUTNUMBERED_FILENAME,self.pathDb)
if self.isFeedCacheAvailable(cachePathFileName,CACHE_EXPIRY_MINS):
videos=self.readFeedCache(cachePathFileName)
if videos is not None:
return(videos)
sections=Sections()
videos = {}
httpNetRequest=HttpNetRequest()
response=httpNetRequest=httpNetRequest.getHttpNetRequest(url)
status=response.status_code
searchIndex=0
response.close()
if status!=200:
return None
if LOG_HTTP_RESPONSES:
self.writeLog(url)
self.writeLog(response.text)
while -1!= searchIndex:
video, searchIndex = sections.getItemsInSection(response.text,"article",searchIndex)
if video is not None and not (video.description in videos):
videos[video.description]=video
video.setFeedTime(DateTimeHelper.applyRelativeTime(now,video.feedTimeOffset))
videoList=list(videos.values())
videoList=sorted(videoList, key=lambda x:x.getFeedTime(),reverse=True)
self.writeFeedCache(cachePathFileName,videoList)
return (videoList)
response = None
try:
now=datetime.now()
cachePathFileName=PathHelper.makePathFileName(VIDEODB_OUTNUMBERED_FILENAME,self.pathDb)
if self.isFeedCacheAvailable(cachePathFileName,CACHE_EXPIRY_MINS):
videos=self.readFeedCache(cachePathFileName)
if videos is not None:
return(videos)
sections=Sections()
videos = {}
httpNetRequest=HttpNetRequest()
response=httpNetRequest.getHttpNetRequest(url)
status=response.status_code
searchIndex=0
if status!=200:
return None
if LOG_HTTP_RESPONSES:
self.writeLog(url)
self.writeLog(response.text)
while -1!= searchIndex:
video, searchIndex = sections.getItemsInSection(response.text,"article",searchIndex)
if video is not None and not (video.description in videos):
videos[video.description]=video
video.setFeedTime(DateTimeHelper.applyRelativeTime(now,video.feedTimeOffset))
videoList=list(videos.values())
videoList=sorted(videoList, key=lambda x:x.getFeedTime(),reverse=True)
self.writeFeedCache(cachePathFileName,videoList)
return (videoList)
finally:
if None!=response:
response.close()
def getItemsInFeed(self,url):
now=datetime.now()
cachePathFileName=PathHelper.makePathFileName(VIDEODB_FILENAME,self.pathDb)
if self.isFeedCacheAvailable(cachePathFileName,CACHE_EXPIRY_MINS):
self.writeLog(f"Loading videos from cache {cachePathFileName}")
videos=self.readFeedCache(cachePathFileName)
if videos is not None:
return(videos)
sections=Sections()
videos = {}
httpNetRequest=HttpNetRequest()
self.writeLog(f"Loading videos from {url}")
response=httpNetRequest=httpNetRequest.getHttpNetRequest(url)
status=response.status_code
searchIndex=0
response.close()
if status!=200:
return None
if LOG_HTTP_RESPONSES:
self.writeLog(url)
self.writeLog(response.text)
while -1!= searchIndex:
video, searchIndex= sections.getItemsInSection(response.text,"article",searchIndex)
if video is not None and not (video.description in videos):
videos[video.description]=video
video.setFeedTime(DateTimeHelper.applyRelativeTime(now,video.feedTimeOffset))
# videoList=list(videos.values())
videoList=self.filterFeedMaxDays(list(videos.values()),FEED_REJECT_IF_OLDER_THAN_DAYS)
videoList=sorted(videoList, key=lambda x:x.getFeedTime(),reverse=True)
self.writeFeedCache(cachePathFileName,videoList)
return (videoList)
response = None
parse_total = 0 # timing
time_total = 0 # timing
count = 0 # timing
try:
self.writeLog("getItemsInFeed[ENTER]")
now=datetime.now()
cachePathFileName=PathHelper.makePathFileName(VIDEODB_FILENAME,self.pathDb)
if self.isFeedCacheAvailable(cachePathFileName,CACHE_EXPIRY_MINS):
self.writeLog(f"[getItemsInFeed] Loading videos from cache {cachePathFileName}")
videos=self.readFeedCache(cachePathFileName)
if videos is not None:
return(videos)
sections=Sections()
videos = {}
httpNetRequest=HttpNetRequest()
self.writeLog(f"[getItemsInFeed] Loading videos from site '{url}'")
start_time = time.perf_counter()
response=httpNetRequest.getHttpNetRequest(url)
self.writeLog(f"[getItemsInFeed] Request from {url} completed in {time.perf_counter() - start_time:.4f} seconds.")
status=response.status_code
searchIndex=0
if status!=200:
return None
if LOG_HTTP_RESPONSES:
self.writeLog(f"[getItemsInFeed] Request {url}")
self.writeLog(f"[getItemsInFeed] Returned {response.text}" )
self.writeLog(f"Received {len(response.text)} bytes.")
while -1!= searchIndex:
t0 = time.perf_counter() # timing
video, searchIndex= sections.getItemsInSection(response.text,"article",searchIndex)
t1 = time.perf_counter() # timing
parse_total += (t1 - t0) # timing
if video is not None and not (video.description in videos):
t2 = time.perf_counter() # timing
video.setFeedTime(DateTimeHelper.applyRelativeTime(now,video.feedTimeOffset))
t3 = time.perf_counter() # timing
time_total += (t3 - t2) # timing
videos[video.description]=video
count += 1 # timing
t0 = time.perf_counter() # timing
videoList=self.filterFeedMaxDays(list(videos.values()),FEED_REJECT_IF_OLDER_THAN_DAYS)
t1 = time.perf_counter()
self.writeLog(f"[TIMING] filterFeedMaxDays took {t1 - t0:.4f} seconds")
videoList=sorted(videoList, key=lambda x:x.getFeedTime(),reverse=True)
self.writeFeedCache(cachePathFileName,videoList)
return (videoList)
finally:
self.writeLog(f"items: {count}") # timing
self.writeLog(f"parse_total: {parse_total:.2f}s") # timing
self.writeLog(f"time_total: {time_total:.2f}s") # timing
self.writeLog("getItemsInFeed[LEAVE]")
if None!=response:
response.close()
def filterFeedMaxDays(self, videos, days):
now = datetime.now()
@@ -125,91 +157,100 @@ class NewsFeed:
for video in videos:
delta = now - video.getFeedTime()
if delta.days <= days:
message = f"INCL. days={delta.days},feed time={video.getFeedTime()} feed time offset (strPublication)=:'{video.feedTimeOffset}', description={video.description}"
self.writeLog(message)
filteredList.insert(0,video)
# message = f"INCL. days={delta.days},feed time={video.getFeedTime()} feed time offset (strPublication)=:'{video.feedTimeOffset}', description={video.description}"
# self.writeLog(message)
filteredList.append(video)
else:
message = f"EXCL. days={delta.days},feed time={video.getFeedTime()} feed time offset (strPublication)=:'{video.feedTimeOffset}', description={video.description}"
self.writeLog(message)
pass
# message = f"EXCL. days={delta.days},feed time={video.getFeedTime()} feed time offset (strPublication)=:'{video.feedTimeOffset}', description={video.description}"
# self.writeLog(message)
return filteredList
def getUSItemsInFeed(self,url):
now=datetime.now()
cachePathFileName=PathHelper.makePathFileName(VIDEODB_US_FILENAME,self.pathDb)
if self.isFeedCacheAvailable(cachePathFileName,CACHE_EXPIRY_MINS):
videos=self.readFeedCache(cachePathFileName)
if videos is not None:
return(videos)
sections=Sections()
videos = {}
httpNetRequest=HttpNetRequest()
response=httpNetRequest.getHttpNetRequest(url)
status=response.status_code
searchIndex=0
response.close()
if status!=200:
return None
if LOG_HTTP_RESPONSES:
self.writeLog(url)
self.writeLog(response.text)
while -1!= searchIndex:
videoId, searchIndex = sections.getVideoIdInSection(response.text,"article",searchIndex)
if videoId is None:
continue
url='https://video.foxnews.com/v/'+videoId
response = None
try:
now=datetime.now()
cachePathFileName=PathHelper.makePathFileName(VIDEODB_US_FILENAME,self.pathDb)
if self.isFeedCacheAvailable(cachePathFileName,CACHE_EXPIRY_MINS):
videos=self.readFeedCache(cachePathFileName)
if videos is not None:
return(videos)
sections=Sections()
videos = {}
httpNetRequest=HttpNetRequest()
innerResponse=httpNetRequest.getHttpNetRequest(url)
status=innerResponse.status_code
innerResponse.close()
response=httpNetRequest.getHttpNetRequest(url)
status=response.status_code
searchIndex=0
if status!=200:
continue
video=sections.getVideoContentInSection(innerResponse.text)
if video is not None and not (video.description in videos):
videos[video.description]=video
video.setFeedTime(DateTimeHelper.applyRelativeTime(now,video.feedTimeOffset))
videoList=list(videos.values())
videoList=sorted(videoList, key=lambda x:x.getFeedTime(),reverse=True)
self.writeFeedCache(cachePathFileName,videoList)
return (videoList)
return None
if LOG_HTTP_RESPONSES:
self.writeLog(url)
self.writeLog(response.text)
while -1!= searchIndex:
videoId, searchIndex = sections.getVideoIdInSection(response.text,"article",searchIndex)
if videoId is None:
continue
videoUrl='https://video.foxnews.com/v/'+videoId
httpNetRequest=HttpNetRequest()
innerResponse=httpNetRequest.getHttpNetRequest(videoUrl)
status=innerResponse.status_code
innerResponse.close()
if status!=200:
continue
video=sections.getVideoContentInSection(innerResponse.text)
if video is not None and not (video.description in videos):
videos[video.description]=video
video.setFeedTime(DateTimeHelper.applyRelativeTime(now,video.feedTimeOffset))
videoList=list(videos.values())
videoList=sorted(videoList, key=lambda x:x.getFeedTime(),reverse=True)
self.writeFeedCache(cachePathFileName,videoList)
return (videoList)
finally:
if None!=response:
response.close()
def getExclusiveItemsInFeed(self,url):
now=datetime.now()
cachePathFileName=PathHelper.makePathFileName(VIDEODB_EXCLUSIVE_FILENAME,self.pathDb)
if self.isFeedCacheAvailable(cachePathFileName,CACHE_EXPIRY_MINS):
videos=self.readFeedCache(cachePathFileName)
if videos is not None:
return(videos)
sections=Sections()
videos = {}
httpNetRequest=HttpNetRequest()
response=httpNetRequest.getHttpNetRequest(url)
status=response.status_code
searchIndex=0
response.close()
if status!=200:
return None
if LOG_HTTP_RESPONSES:
self.writeLog(url)
self.writeLog(response.Text)
while -1!= searchIndex:
videoId, searchIndex = sections.getVideoIdInSection(response.text,"article",searchIndex)
if videoId is None:
continue
url='https://video.foxnews.com/v/'+videoId
response = None
try:
now=datetime.now()
cachePathFileName=PathHelper.makePathFileName(VIDEODB_EXCLUSIVE_FILENAME,self.pathDb)
if self.isFeedCacheAvailable(cachePathFileName,CACHE_EXPIRY_MINS):
videos=self.readFeedCache(cachePathFileName)
if videos is not None:
return(videos)
sections=Sections()
videos = {}
httpNetRequest=HttpNetRequest()
innerResponse=httpNetRequest.getHttpNetRequest(url)
status=innerResponse.status_code
innerResponse.close()
response=httpNetRequest.getHttpNetRequest(url)
status=response.status_code
searchIndex=0
if status!=200:
continue
video=sections.getVideoContentInSection(innerResponse.text)
if video is not None and not (video.description in videos):
videos[video.description]=video
video.setFeedTime(DateTimeHelper.applyRelativeTime(now,video.feedTimeOffset))
videoList=list(videos.values())
videoList=sorted(videoList, key=lambda x:x.getFeedTime(),reverse=True)
self.writeFeedCache(cachePathFileName,videoList)
return (videoList)
return None
if LOG_HTTP_RESPONSES:
self.writeLog(url)
self.writeLog(response.text)
while -1!= searchIndex:
videoId, searchIndex = sections.getVideoIdInSection(response.text,"article",searchIndex)
if videoId is None:
continue
videoUrl='https://video.foxnews.com/v/'+videoId
httpNetRequest=HttpNetRequest()
innerResponse=httpNetRequest.getHttpNetRequest(videoUrl)
status=innerResponse.status_code
innerResponse.close()
if status!=200:
continue
video=sections.getVideoContentInSection(innerResponse.text)
if video is not None and not (video.description in videos):
videos[video.description]=video
video.setFeedTime(DateTimeHelper.applyRelativeTime(now,video.feedTimeOffset))
videoList=list(videos.values())
videoList=sorted(videoList, key=lambda x:x.getFeedTime(),reverse=True)
self.writeFeedCache(cachePathFileName,videoList)
return (videoList)
finally:
if None!=response:
response.close()
def getItemsInArchiveFeed(self,url,archiveDbFileName):
cachePathFileName=PathHelper.makePathFileName(archiveDbFileName,self.pathDb)
@@ -221,11 +262,11 @@ class NewsFeed:
def readFeedCache(self,pathFileName):
try:
videos=[]
# 'with' will automatically close the stream
with open(pathFileName,"r",encoding='utf-8') as inputStream:
for line in inputStream:
video=Video.fromString(line)
videos.append(video)
inputStream.close()
return(videos)
except:
self.writeLog(traceback.format_exc())
@@ -236,35 +277,36 @@ class NewsFeed:
with open(pathFileName,"w",encoding='utf-8') as outputStream:
for video in videos:
outputStream.write(video.toString()+"\n")
outputStream.close()
# 'with' will automatically close the stream
return(videos)
except:
self.writeLog(traceback.format_exc())
return(videos)
def isFeedCacheAvailable(self,pathFileName,expireMinutes):
def isFeedCacheAvailable(self, pathFileName, expireMinutes):
try:
self.writeLog('Inspecting cache file {pathFileName}'.format(pathFileName=pathFileName))
if not os.path.isfile(pathFileName):
return(False)
modifiedTime=os.path.getmtime(pathFileName)
convertTime=time.localtime(modifiedTime)
formatTime=time.strftime('%d%m%Y %H:%M:%S',convertTime)
fileDateTime=DateTimeHelper.strptime(formatTime,'%d%m%Y %H:%M:%S')
currentTime=datetime.now()
timedelta=currentTime-fileDateTime
hours, hremainder = divmod(timedelta.seconds,3600)
minutes, mremainder = divmod(timedelta.seconds,60)
self.writeLog('file is = "{age}" hours old'.format(age=hours))
self.writeLog('file is = "{age}" minutes old'.format(age=minutes))
if hours > 1 or minutes > expireMinutes:
self.archiveFile(pathFileName)
return(False)
return (True)
self.writeLog('Inspecting cache file {pathFileName}'.format(pathFileName=pathFileName))
if not os.path.isfile(pathFileName):
return False
modifiedTime = os.path.getmtime(pathFileName)
convertTime = time.localtime(modifiedTime)
formatTime = time.strftime('%d%m%Y %H:%M:%S', convertTime)
fileDateTime = time.strptime(formatTime, '%d%m%Y %H:%M:%S')
currentTime = datetime.now()
elapsed = currentTime - datetime(*(fileDateTime[0:6]))
totalSeconds = int(elapsed.total_seconds())
hours, remainder = divmod(totalSeconds, 3600)
minutes, _ = divmod(remainder, 60)
self.writeLog('file is = "{age}" hours old'.format(age=hours))
self.writeLog('file is = "{age}" minutes old'.format(age=minutes))
if hours > 1 or minutes > expireMinutes:
self.archiveFile(pathFileName)
return False
return True
except:
self.writeLog(traceback.format_exc());
return(False)
self.writeLog(traceback.format_exc())
return False
def archiveFile(self, pathFileName):
if not os.path.isfile(pathFileName):
return(False)
@@ -288,65 +330,198 @@ class Sections:
def __init__(self):
# Only sets a placeholder attribute; no other instance state is initialized here.
self.dummy=None
# def getItemsInSection(self, strInput, sectionName, searchIndex):
# video=None
# startSection='<'+sectionName
# endSection='</'+sectionName
# startIndex=strInput.find(startSection,searchIndex)
# if -1 == startIndex:
# searchIndex=-1
# return video, searchIndex
# endIndex=strInput.find(endSection,startIndex)
# if -1 == endIndex:
# searchIndex=-1
# return video, searchIndex
# searchIndex=endIndex+len(endSection)
# strContainingString=strInput[startIndex:endIndex+1+len(endSection)]
# if not strContainingString or strContainingString=="":
# return video, searchIndex
# indexPreview=strContainingString.find("preview=\"")
# if -1 == indexPreview:
# return video, searchIndex
# previewUrl=strContainingString[indexPreview:]
# previewUrl=self.betweenString(previewUrl,'"','"')
# if "tokenvod" in previewUrl:
# return video, searchIndex
# # Handle video description
# indexDescription=strContainingString.find("alt=\"")
# if -1 == indexDescription:
# return video, searchIndex
# description=strContainingString[indexDescription:]
# description=self.betweenString(description,'"','"')
# description=self.removeHtml(description)
# description=description.replace("- Fox News","")
# if "vod.foxbusiness" in description:
# return video, searchIndex
# # Handle video duration
# indexDuration=strContainingString.find("<div class=\"duration\">")
# if -1 != indexDuration:
# strDuration=strContainingString[indexDuration:]
# strDuration=self.betweenString(strDuration,">","<")
# description=description+" - "+strDuration
# # Handle video publication
# strPublication = ""
# indexPublication=strContainingString.find("<div class=\"pub-date\">")
# if -1 != indexPublication:
# strPublication=strContainingString[indexPublication:]
# strPublication=self.betweenString(strPublication,"<time>","</time>")
# description=description+" ("+strPublication+")"
# # Handle the icon
# icon=None
# indexIcon=strContainingString.find("srcset=")
# if -1 != indexIcon:
# icon=strContainingString[indexIcon:]
# icon=self.betweenString(icon,"\"","\"")
# splits=icon.split(',')
# icon=self.betweenString(splits[len(splits)-1],None,'?')
# icon=icon.strip()
# description = description.strip()
# video=Video(description,previewUrl,icon)
# video.feedTimeOffset=strPublication
# return video, searchIndex
def getItemsInSection(self, strInput, sectionName, searchIndex):
video=None
startSection='<'+sectionName
endSection='</'+sectionName
video = None
startIndex=strInput.find(startSection,searchIndex)
if -1 == startIndex:
searchIndex=-1
return video, searchIndex
startTag = '<' + sectionName
endTag = '</' + sectionName + '>'
endIndex=strInput.find(endSection,startIndex)
if -1 == endIndex:
searchIndex=-1
return video, searchIndex
# ---------------------------------------------------
# LOCATE SECTION BOUNDS (ONE SCAN)
# ---------------------------------------------------
startIndex = strInput.find(startTag, searchIndex)
if startIndex == -1:
return None, -1
searchIndex=endIndex+len(endSection)
strContainingString=strInput[startIndex:endIndex+1+len(endSection)]
endIndex = strInput.find(endTag, startIndex)
if endIndex == -1:
return None, -1
if not strContainingString or strContainingString=="":
return video, searchIndex
searchIndex = endIndex + len(endTag)
indexPreview=strContainingString.find("preview=\"")
if -1 == indexPreview:
return video, searchIndex
previewUrl=strContainingString[indexPreview:]
previewUrl=self.betweenString(previewUrl,'"','"')
if "tokenvod" in previewUrl:
return video, searchIndex
s = strInput
i = startIndex
end = endIndex
# ---------------------------------------------------
# CURSOR INITIALIZATION
# ---------------------------------------------------
cursor = i
previewUrl = None
description = None
pub = ""
icon = None
# ---------------------------------------------------
# SINGLE PASS SCAN THROUGH ARTICLE BLOCK
# ---------------------------------------------------
while cursor < end:
# ---------------- preview ----------------
if previewUrl is None:
p = s.find('preview="', cursor, end)
if p != -1:
p += 9
q = s.find('"', p, end)
if q != -1:
previewUrl = s[p:q]
if "tokenvod" in previewUrl:
return None, searchIndex
cursor = q + 1
continue
# ---------------- description ----------------
if description is None:
a = s.find('alt="', cursor, end)
if a != -1:
a += 5
b = s.find('"', a, end)
if b != -1:
description = self.removeHtml(s[a:b])
description = description.replace("- Fox News", "")
if "vod.foxbusiness" in description:
return None, searchIndex
cursor = b + 1
continue
# ---------------- duration ----------------
d = s.find('<div class="duration">', cursor, end)
if d != -1:
d += 24
e = s.find('</div>', d, end)
if e != -1:
if description:
description += " - " + s[d:e].strip()
cursor = e + 6
continue
# ---------------- publication ----------------
t = s.find('<time', cursor, end)
if t != -1:
t = s.find('>', t, end)
if t != -1:
u = s.find('</time>', t, end)
if u != -1:
pub = s[t+1:u].strip()
if pub and description:
description += " (" + pub + ")"
cursor = u + 7
continue
# ---------------- icon ----------------
r = s.find('srcset=', cursor, end)
if r != -1:
r += 8
q1 = s.find('"', r, end)
if q1 != -1:
q1 += 1
q2 = s.find('"', q1, end)
if q2 != -1:
parts = s[q1:q2].split(',')
last = parts[-1].strip()
icon = last.split('?')[0]
cursor = q2 + 1
continue
# advance if nothing matched
cursor += 1
# ---------------------------------------------------
# VALIDATION
# ---------------------------------------------------
if not previewUrl or not description:
return None, searchIndex
indexDescription=strContainingString.index("alt=\"")
description=strContainingString[indexDescription:]
description=self.betweenString(description,'"','"')
description=self.removeHtml(description)
description=description.replace("- Fox News","")
if "vod.foxbusiness" in description:
return video, searchIndex
indexDuration=strContainingString.index("<div class=\"duration\">")
if -1 != indexDuration:
strDuration=strContainingString[indexDuration:]
strDuration=self.betweenString(strDuration,">","<")
description=description+" - "+strDuration
indexPublication=strContainingString.index("<div class=\"pub-date\">")
if -1 != indexPublication:
strPublication=strContainingString[indexPublication:]
strPublication=self.betweenString(strPublication,"<time>","</time>")
description=description+" ("+strPublication+")"
icon=None
indexIcon=strContainingString.index("srcset=")
if -1 != indexIcon:
icon=strContainingString[indexIcon:]
icon=self.betweenString(icon,"\"","\"")
splits=icon.split(',')
icon=self.betweenString(splits[len(splits)-1],None,'?')
icon=icon.strip()
description = description.strip()
video=Video(description,previewUrl,icon)
video.feedTimeOffset=strPublication
return video, searchIndex
# ---------------------------------------------------
# BUILD OBJECT
# ---------------------------------------------------
video = Video(description, previewUrl, icon)
video.feedTimeOffset = pub
return video, searchIndex
def getVideoIdInSection(self, strInput, sectionName, searchIndex):
video=None
startSection='<'+sectionName
@@ -415,19 +590,37 @@ class Sections:
def betweenString(self, strItem, strBegin, strEnd ):
    """Convenience wrapper: forward the call to StringHelper.betweenString.

    The three arguments are passed through unchanged and the helper's
    result is returned as-is.
    """
    extracted = StringHelper.betweenString(strItem, strBegin, strEnd)
    return extracted
def removeHtml(self,strItem):
if strItem is None:
return None
codes={"&#x27;","&#187;"}
for code in codes:
strItem=strItem.replace(code,"'")
strItem=strItem.replace("&amp;","&")
strItem=strItem.replace("&#x2018;","'")
strItem=strItem.replace("&#x2019;","'")
strItem=strItem.replace("&#x2014;","-")
strItem=strItem.replace("???","'")
return strItem
# def removeHtml(self,strItem):
# if strItem is None:
# return None
# codes={"&#x27;","&#187;"}
# for code in codes:
# strItem=strItem.replace(code,"'")
# strItem=strItem.replace("&amp;","&")
# strItem=strItem.replace("&#x2018;","")
# strItem=strItem.replace("&#x2019;","")
# strItem=strItem.replace("&#x2014;","-")
# strItem=strItem.replace("&#39;","'")
# strItem=strItem.replace("???","'")
# strItem=strItem.replace("&quot;","\"")
# return strItem
def removeHtml(self, s):
    """Replace a fixed set of HTML entities in s with plain characters.

    Falsy input (None or an empty string) is returned unchanged.
    Substitutions are applied one after another in the order listed
    below, so the result of an earlier substitution can be matched by
    a later one.
    """
    if not s:
        return s
    # (entity, replacement) pairs; the order is significant.
    substitutions = (
        ("&#x27;", "'"),
        ("&#187;", "'"),
        ("&amp;", "&"),
        ("&#x2018;", ""),
        ("&#x2019;", ""),
        ("&#x2014;", "-"),
        ("&#39;", "'"),
        ("???", "'"),
        ("&quot;", '"'),
    )
    cleaned = s
    for entity, plain in substitutions:
        cleaned = cleaned.replace(entity, plain)
    return cleaned
def pad(str,filler,length):
stringLength=len(str)
sb=""
@@ -447,6 +640,19 @@ def parseDuration(strDuration):
# DON'T LEAVE ANYTHING OPEN BELOW THIS LINE BECAUSE THIS FILE IS IMPORTED BY OTHER MODULES AND ANY CODE NOT IN A CLASS WILL BE RUN
# strdate = "January 1, 2026"
# if DateTimeHelper.canstrptimeex(strdate):
# theDate = DateTimeHelper.strptimeex(strdate)
# if(not isinstance(theDate,datetime)):
# raise Exception('Invalid type for parameter')
# feedTimeOffset = "January 13, 2025"
# currentTime = datetime.now()
# for i in range(1,100):
# relativeTime = DateTimeHelper.applyRelativeTime(currentTime,feedTimeOffset)
# print(relativeTime)
#print(FOX_NEWS_URL)
# pathFileName='/home/pi/.kodi/addons/plugin.video.fox.news/resources/lib/videodb.txt'
@@ -468,10 +674,10 @@ def parseDuration(strDuration):
# newsFeed=NewsFeed('/home/pi/Projects/Python/NewsFeed/')
# newsFeed=NewsFeed('/home/pi/.kodi/addons/plugin.video.fox.news/resources/lib/videodb.txt')
# newsFeed=NewsFeed(PATH_VIDEO_DATABASE, myLog())
# newsFeed=NewsFeed('/home/pi/Projects/Python/NewsFeed/', myLog())
# newsFeed=NewsFeed('C:/Python/NewsFeed/Archive', myLog())
# videos=newsFeed.getItemsInFeed(FOX_NEWS_URL)
# for video in videos:
# if(video.description.startswith("Martha")):
# print(f"Description={video.description}")
# print(f"Url={video.url}")
# print(f"getTimestamp={video.getTimestamp().toStringMonthDay()}")
@@ -479,6 +685,7 @@ def parseDuration(strDuration):
# print(f"getFeedTime={video.getFeedTime()}")
# print(f"daysOld={(datetime.now()-video.getFeedTime()).days}")
# print(' ')
# print(f"Got {len(videos)} videos")
# pull the time out of the description and subtract it from the time we scanned the feed.
# the result will be the time of the article..use this to sort on.

View File

@@ -36,29 +36,62 @@ class StringHelper:
def betweenString(strItem, strBegin, strEnd):
if strItem is None:
return None
index=-1
if strBegin is None:
index=0
start = 0
else:
index = strItem.index(strBegin)
if -1==index:
return None
str=None
if strBegin is not None:
str=strItem[index+len(strBegin):]
else:
str=strItem
try:
if strBegin.startswith("<") and strBegin.endswith(">"):
tag_name = strBegin[1:-1]
start = strItem.index("<" + tag_name)
start = strItem.index(">", start) + 1
else:
start = strItem.index(strBegin) + len(strBegin)
except ValueError:
return None
if strEnd is None:
return str
index=str.index(strEnd)
if -1==index :
return strItem[start:]
try:
end = strItem.index(strEnd, start)
except ValueError:
return None
sb=""
for strIndex in range(0, len(str)-1):
if index==strIndex:
break
sb=sb+str[strIndex]
return (sb)
return strItem[start:end]
# @staticmethod
# def betweenString(strItem, strBegin, strEnd):
# if strItem is None:
# return None
# index = -1
# if strBegin is None:
# index = 0
# else:
# try:
# if strBegin.startswith("<") and strBegin.endswith(">"):
# tag_name = strBegin[1:-1] # e.g. "time"
# index = strItem.index("<" + tag_name)
# index = strItem.index(">", index) + 1
# else:
# index = strItem.index(strBegin) + len(strBegin)
# except ValueError:
# return None
# if index == -1:
# return None
# str = strItem[index:] if strBegin is not None else strItem
# if strEnd is None:
# return str
# try:
# index = str.index(strEnd)
# except ValueError:
# return None
# sb = ""
# for strIndex in range(0, len(str) - 1):
# if index == strIndex:
# break
# sb = sb + str[strIndex]
# return sb
class HttpNetRequest:
def __init__(self):
@@ -117,66 +150,97 @@ class DateTimeHelper:
def getCurrentDateTime():
# Thin wrapper around datetime.now(); called without a tz argument, so the
# result carries no timezone information.
return datetime.now()
# January 1, 2026
@staticmethod
def strptime(theTime,theFormat):
try:
return datetime.strptime(theTime,theFormat)
except:
return datetime(*(time.strptime(theTime,theFormat)[0:6]))
def strptime(date_string):
month_map = {
'January': 1, 'February': 2, 'March': 3, 'April': 4,
'May': 5, 'June': 6, 'July': 7, 'August': 8,
'September': 9, 'October': 10, 'November': 11, 'December': 12
}
@staticmethod
def canstrptime(theTime,theFormat):
try:
datetime.strptime(theTime,theFormat)
return True
except:
date_string = date_string.replace(',', '')
parts = date_string.split()
if len(parts) == 3:
month_str, day_str, year_str = parts
month = month_map.get(month_str)
day = int(day_str)
year = int(year_str)
if month is not None:
return datetime(year, month, day)
else:
raise ValueError("Invalid month name in date string")
else:
raise ValueError("Date string format is incorrect")
# January 1, 2026
@staticmethod
def canstrptime(date_string):
    """Report whether date_string is a valid 'Month D, YYYY' date.

    Accepts full English month names only (e.g. "January 1, 2026");
    commas are ignored. Returns True only when the string has exactly
    three whitespace-separated parts, the first is a known month name,
    and the day/year form a real calendar date. Never raises for
    malformed string input; returns False instead.
    """
    month_map = {
        'January': 1, 'February': 2, 'March': 3, 'April': 4,
        'May': 5, 'June': 6, 'July': 7, 'August': 8,
        'September': 9, 'October': 10, 'November': 11, 'December': 12
    }

    parts = date_string.replace(',', '').split()
    if len(parts) != 3:
        return False

    month_str, day_str, year_str = parts
    month = month_map.get(month_str)
    if month is None:
        return False

    try:
        # Constructing the datetime checks both that day/year are integers
        # and that they describe a real date (rejects e.g. February 30).
        datetime(int(year_str), month, int(day_str))
    except (ValueError, OverflowError):
        return False
    return True
# returns a datetime
@staticmethod
def applyRelativeTime(sometime,relativetime):
relativeTimeResult = sometime
if(not isinstance(sometime,datetime)):
raise Exception('Invalid type for parameter')
if(not isinstance(relativetime,str)):
raise Exception('Invalid type for parameter')
if DateTimeHelper.canstrptime(relativetime,'%B %d, %Y'):
sometime = DateTimeHelper.strptime(relativetime,'%B %d, %Y')
return sometime
if DateTimeHelper.canstrptime(relativetime):
relativeTimeResult = DateTimeHelper.strptime(relativetime)
return relativeTimeResult
if relativetime=='just now':
return sometime
return relativeTimeResult
if relativetime=='just in':
return sometime
return relativeTimeResult
relativetimesplit=relativetime.split()
if len(relativetimesplit)==2:
year=datetime.now().year
relativetimex=relativetime+', '+str(year)
relativeDate = DateTimeHelper.strptime(relativetimex, '%B %d, %Y')
relativeDate = DateTimeHelper.strptime(relativetimex)
if(relativeDate>datetime.now()):
year=datetime.now().year-1
relativetimex=relativetime+', '+str(year)
relativeDate=DateTimeHelper.strptime(relativetimex,'%B %d, %Y')
relativeDate=DateTimeHelper.strptime(relativetimex)
days=sometime-relativeDate
sometime=sometime-days
relativeTimeResult=sometime-days
elif relativetimesplit[1]=='hour' or relativetimesplit[1]=='hours':
hours=int(relativetimesplit[0])
sometime=sometime-timedelta(hours=hours)
relativeTimeResult=sometime-timedelta(hours=hours)
elif relativetimesplit[1]=='day' or relativetimesplit[1]=='days':
days=int(relativetimesplit[0])
sometime=sometime-timedelta(days=days)
relativeTimeResult=sometime-timedelta(days=days)
elif relativetimesplit[1]=='minute' or relativetimesplit[1]=='minutes':
minutes=int(relativetimesplit[0])
sometime=sometime-timedelta(minutes=minutes)
relativeTimeResult=sometime-timedelta(minutes=minutes)
elif len(relativetimesplit)==3: # '16 mins ago' '2 hours ago'
if relativetimesplit[1]=='mins':
minutes=int(relativetimesplit[0])
sometime=sometime-timedelta(minutes=minutes)
relativeTimeResult=sometime-timedelta(minutes=minutes)
elif relativetimesplit[1]=='hours':
hours=int(relativetimesplit[0])
sometime=sometime-timedelta(hours=hours)
relativeTimeResult=sometime-timedelta(hours=hours)
elif relativetimesplit[1]=='day' or relativetimesplit[1]=='days':
days=int(relativetimesplit[0])
sometime=sometime-timedelta(days=days)
return sometime
relativeTimeResult=sometime-timedelta(days=days)
return relativeTimeResult
class DateTime:
def __init__(self):

View File

@@ -68,7 +68,12 @@ class Video:
description=splits[0].strip()
url=splits[1].strip()
icon=splits[2].strip()
timestamp=DateTime(splits[3].strip())
datePart = splits[3].strip()
timestamp = DateTime()
try :
timestamp=DateTime(datePart)
except Exception as exception:
print(f"Encountered invalid date '{datePart}'")
return(Video(description,url,icon,timestamp))
@staticmethod