Newsfeed/archive.py

import os
import glob
import functools
from environment import *
from utility import *
from video import *

# This file is executed in a cron job.
# To view the cron schedule type sudo crontab -l in a shell; to edit it type sudo crontab -e (do NOT use -r, which deletes the crontab).  Use Ctrl-S to save after editing
# This cron job should run every 30 minutes.  Shorter intervals burden the system
# The output from the print statements is generated in the syslog /var/log/syslog    sudo nano /var/log/syslog
# Overall system performance can be monitored using htop

def comparator(item1, item2):
    """Compare two names by the integer that follows their last '.'.

    Old-style comparison function (usable with functools.cmp_to_key) for
    names that end in a numeric suffix, e.g. 'videodb.txt.3'.

    Parameters:
      item1, item2: strings to compare.

    Returns:
      -1 if item1's numeric suffix is smaller, 1 if it is larger, 0 if the
      suffixes are equal.  0 is also returned when either suffix cannot be
      parsed as an integer (or an item is not a string), so such items are
      treated as equal rather than raising.
    """
    try:
        index1 = int(item1.split('.')[-1])
        index2 = int(item2.split('.')[-1])
    except Exception:
        # Only ordinary errors are treated as "not comparable"; SystemExit and
        # KeyboardInterrupt are deliberately allowed to propagate.
        return 0
    if index1 < index2:
        return -1
    if index1 > index2:
        return 1
    return 0

def createArchive(pathOutputFile,tokens,files):
    """Merge keyword-matching videos from the archive files into one archive.

    The existing contents of pathOutputFile are loaded with Video.load, then
    every line of every file in files is scanned.  A line is kept when it
    contains any of tokens (case-insensitive substring match) and its
    description has not been seen yet.  Kept videos get a description built
    by createDescription and are stored under that description.  The merged
    collection is finally written back with Video.write.

    Parameters:
      pathOutputFile: path of the archive to load and rewrite.
      tokens: list of keyword strings to search for.
      files: paths of the input files; each is read as UTF-8 text.

    Errors while reading one input file are printed and the rest of that file
    is skipped.  Any other error inside the processing/writing phase is
    printed and swallowed.  Errors raised by Video.load propagate.
    """
    for token in tokens:
        print('Filtering for "{token}"'.format(token=token))

    # Lower-case the tokens once instead of once per scanned line.
    loweredTokens = [token.lower() for token in tokens]

    videos = Video.load(pathOutputFile)

    # Seed the "already seen" set from the videos that are in the archive now.
    unique = set()
    for video in videos.values():
        unique.add(createDescription(video.description, video.getTimestamp()))

    try:
        print('found {count} archive files.'.format(count=len(files)))
        print('processing {pathOutputFile}'.format(pathOutputFile=pathOutputFile))
        for file in files:
            try:
                # The with-statement closes the stream; no explicit close needed.
                with open(file, "r", encoding='utf-8') as inputStream:
                    for line in inputStream:
                        lowerLine = line.lower()
                        if not any(token in lowerLine for token in loweredTokens):
                            continue
                        video = Video.fromString(line)
                        # NOTE(review): heading is the description as parsed from
                        # the line, while the seeded entries above are
                        # createDescription() results - confirm the two forms
                        # are meant to be comparable.
                        heading = video.getDescription()
                        if heading in unique:
                            continue
                        unique.add(heading)
                        video.description = createDescription(video.description, video.getTimestamp())
                        videos[video.description] = video
            except Exception as exception:
                print('Exception reading {file} {exception}'.format(file=file,exception=exception))
                continue
        print('writing {pathOutputFile}'.format(pathOutputFile=pathOutputFile))
        Video.write(pathOutputFile, videos)
    except Exception as exception:
        print('Exception creating output file {file} {exception}'.format(file=pathOutputFile,exception=exception))
    return

# clean the archive files by removing files older than 'expiryDays'
def cleanArchive(files, expiryDays):
    """Delete every file in files that is older than expiryDays days.

    A file's age is the day component that DateTime.deltaTime reports between
    the file's modification time (taken as UTC) and DateTime.now().  Files
    whose day count is strictly greater than expiryDays are removed with
    os.remove after the whole list has been examined.

    Parameters:
      files: paths of the files to examine.
      expiryDays: age threshold in days.

    Returns None.  Errors from os.path.getmtime / os.remove propagate.
    """
    stale = []
    for candidate in files:
        modifiedAt = datetime.fromtimestamp(os.path.getmtime(candidate), timezone.utc)
        # Only the day component decides expiry; the rest is unpacked and ignored.
        ageInDays, _hours, _minutes, _seconds = DateTime.deltaTime(modifiedAt, DateTime.now())
        if ageInDays > expiryDays:
            stale.append(candidate)

    print('Expiring {count} files.'.format(count=len(stale)))

    for expired in stale:
        os.remove(expired)
    return

def createDescription(strDescription, timeStamp):
    """Build the description string used as the key of an archived video.

    The result is '<text>- <duration> (<monthDay>)' where
      text     = StringHelper.betweenString(strDescription, None, '-')
      duration = the part between two spaces of
                 StringHelper.betweenString(strDescription, '-', None)
      monthDay = timeStamp.toStringMonthDay()

    Parameters:
      strDescription: the original description text.
      timeStamp: object providing toStringMonthDay().
    """
    headline = StringHelper.betweenString(strDescription, None, '-')
    remainder = StringHelper.betweenString(strDescription, '-', None)
    duration = StringHelper.betweenString(remainder, ' ', ' ')
    return headline + '- ' + duration + ' (' + timeStamp.toStringMonthDay() + ')'

def getFiles(archiveFileLike):
    """Return the paths matching archiveFileLike and archiveFileLike + '.*'.

    archiveFileLike is passed to glob.glob as a pattern.  Matches for the
    pattern itself come first in the returned list, followed by matches for
    the pattern with a '.<anything>' suffix.
    """
    patterns = (archiveFileLike, archiveFileLike + '.*')
    matches = []
    for pattern in patterns:
        matches.extend(glob.glob(pattern))
    return matches

# This program runs through all of the videodb*.txt files looking for keywords with which to
# build each of the individually named mini-archives.
# 1) Search for all videodb.txt.* files
# 2) Expire files older than specified number of days
# 3) Load the archive (for each of the types enumerated below)
# 4) Run through the file collection for the given archive and append to the archive as tags are found
# 5) Sort the archive
# 6) Truncate existing archive if it exists
# 7) Write the new archive

path=PATH_VIDEO_DATABASE
archiveFile=path+'/videodb'
archiveFileLike=archiveFile+'.txt'

#For debugging
# path='/home/pi/Projects/Python/NewsFeed/Archive'
# archiveFile=path+'/videodb'
# archiveFileLike=archiveFile+'.txt'

# Input files older than this many days are deleted before processing.
ARCHIVE_EXPIRY_DAYS=30

files = getFiles(archiveFileLike)
print('There are {count} archive files to process before cleaning'.format(count=len(files)))
cleanArchive(files, ARCHIVE_EXPIRY_DAYS)
# Re-scan so that the files just expired are not processed.
files = getFiles(archiveFileLike)
print('There are {count} archive files to process after cleaning'.format(count=len(files)))

print('archive.py running...')

# One entry per mini-archive: (output file name, keywords to filter on).
# Keywords are matched as case-insensitive substrings; " War " keeps its
# surrounding spaces so that only the stand-alone word matches.
archiveJobs=[
  (ARCHIVEDB_FILENAME, ["Keane","Jesse","Israel","Hamas"," War ","Iran","Hezbollah","Gaza","Ukraine"]),
  (HANNITYARCHIVEDB_FILENAME, ["Hannity"]),
  (LEVINARCHIVEDB_FILENAME, ["Levin"]),
  (HAWLEYARCHIVEDB_FILENAME, ["Hawley"]),
  (MILITARYARCHIVEDB_FILENAME, ["Keane","Kellogg","Russia","Ukraine","Israel","Korea","Iran","Venezuela","Cuba","China"]),
]

for archiveFileName,tokens in archiveJobs:
  pathOutputFile=PathHelper.makePathFileName(archiveFileName,path)
  print('pathOutputFile={pathOutputFile}'.format(pathOutputFile=pathOutputFile))
  createArchive(pathOutputFile,tokens,files)

print('archive.py done.')