# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Parse hyperlinks in Word files.
"""
from . import _ParserPlugin
try:
import win32com
import pythoncom
has_win32com = True
Error = pythoncom.com_error
except ImportError:
has_win32com = False
Error = Exception
from .. import fileutil, log, LOG_PLUGIN
# Set to True by init_win32com() once the win32com gencache has been
# prepared, so that repeated calls return immediately.
_initialized = False
[docs]
def init_win32com():
    """Initialize the win32com.client cache.

    Makes the gencache writable when it is read-only and rebuilds it so
    that the cached COM wrapper objects can be created. The work is done
    at most once; later calls return immediately.

    When the pywin32 import at module load failed (``has_win32com`` is
    false) there is nothing to initialize, so the function returns without
    doing anything instead of raising ImportError.
    """
    global _initialized
    if _initialized or not has_win32com:
        # Already done, or pywin32 is missing and the import below would
        # raise ImportError.
        return
    import win32com.client

    gencache = win32com.client.gencache
    if gencache.is_readonly:
        # allow gencache to create the cached wrapper objects
        gencache.is_readonly = False
        # under py2exe the call in gencache to __init__() does not happen
        # so we use Rebuild() to force the creation of the gen_py folder
        # Note that the python...\win32com.client.gen_py dir must not exist
        # to allow creation of the cache in %temp% for py2exe.
        # This is ensured by excluding win32com.gen_py in setup.py
        gencache.Rebuild()
    _initialized = True
[docs]
def has_word():
    """Determine if Word is available on the current system.

    Returns:
        bool: True when pywin32 was imported successfully and the
        ``Word.Application`` key can be opened under HKEY_CLASSES_ROOT;
        False otherwise, including when the ``winreg`` module cannot be
        imported.
    """
    if not has_win32com:
        return False
    try:
        # Import inside the guarded block so that a missing registry
        # module is reported as "no Word" instead of propagating.
        import winreg

        key = winreg.OpenKey(winreg.HKEY_CLASSES_ROOT, "Word.Application")
        winreg.CloseKey(key)
    except (OSError, ImportError):
        return False
    return True
[docs]
def constants(name):
    """Look up a constant by name on win32com.client.constants.

    Exists so that other modules do not have to import win32com.client
    themselves.
    """
    table = win32com.client.constants
    return getattr(table, name)
[docs]
def get_word_app():
    """Return an open Word.Application handle.

    Returns None when Word is not available on this system. The returned
    application object has its Visible property set to False.
    """
    if has_word():
        # This function is called from different threads, so the COM
        # layer is initialized before dispatching.
        pythoncom.CoInitialize()
        import win32com.client

        word = win32com.client.gencache.EnsureDispatch("Word.Application")
        word.Visible = False
        return word
    return None
[docs]
def close_word_app(app):
    """Close Word application object.

    Calls Quit() on the given application object, e.g. the handle
    returned by get_word_app().
    """
    app.Quit()
[docs]
def open_wordfile(app, filename):
    """Open the given Word file through the application object.

    The document is requested read-only and invisible, is not added to
    the recent-files list, and the encoding dialog is suppressed.
    Returns whatever app.Documents.Open() returns.
    """
    open_options = {
        "ReadOnly": True,
        "AddToRecentFiles": False,
        "Visible": False,
        "NoEncodingDialog": True,
    }
    return app.Documents.Open(filename, **open_options)
[docs]
def close_wordfile(doc):
    """Close word file.

    Calls Close() on the given document object, e.g. the one returned by
    open_wordfile().
    """
    doc.Close()
[docs]
class WordParser(_ParserPlugin):
    """Word parsing plugin.

    Extracts the hyperlinks of a Word document by opening it with a Word
    application object and registers each one on the URL being checked.
    """

    def __init__(self, config):
        """Prepare the win32com cache and check for Word.

        A warning is logged when Word is not found; the plugin is still
        constructed, and applies_to() then rejects every URL.
        """
        if has_win32com:
            # init_win32com() imports win32com.client, which raises
            # ImportError when pywin32 is not installed.
            init_win32com()
        if not has_word():
            log.warn(LOG_PLUGIN, "Microsoft Word not found for WordParser plugin")
        super().__init__(config)

    def applies_to(self, url_data, pagetype=None):
        """Check for Word pagetype."""
        return has_word() and pagetype == 'word'

    def check(self, url_data):
        """Parse Word data.

        The raw content of url_data is written to a temporary file, the
        file is opened with Word, and every hyperlink of the document is
        passed to url_data.add_url() together with its display text and
        line number. Errors of type ``Error`` are logged as warnings and
        not raised. The temporary file is removed afterwards.
        """
        import os

        content = url_data.get_raw_content()
        filename = get_temp_filename(content)
        # open word file and parse hyperlinks
        try:
            app = get_word_app()
            if app is None:
                # get_word_app() returns None when Word is unavailable;
                # report it like any other parse failure.
                raise Error("could not start Microsoft Word")
            try:
                doc = open_wordfile(app, filename)
                if doc is None:
                    raise Error("could not open word file %r" % filename)
                try:
                    for link in doc.Hyperlinks:
                        line = get_line_number(doc, link.Range)
                        name = link.TextToDisplay
                        url_data.add_url(link.Address, name=name, line=line)
                finally:
                    close_wordfile(doc)
            finally:
                close_word_app(app)
        except Error as msg:
            log.warn(LOG_PLUGIN, "Error parsing word file: %s", msg)
        finally:
            # The temporary copy is only needed while Word has it open.
            try:
                os.remove(filename)
            except OSError as msg:
                log.warn(
                    LOG_PLUGIN,
                    "Could not remove temporary word file %r: %s",
                    filename,
                    msg,
                )
[docs]
def get_line_number(doc, wrange):
    """Get line number for given range object.

    Selects the range and then moves the selection up one line at a time,
    counting the moves, until the line number reported by
    Selection.Information() no longer changes.

    Args:
        doc: Word document object whose Selection is used for navigation.
        wrange: Word range object to locate; it is selected first.

    Returns:
        int: 1-based line number, i.e. 1 plus the number of successful
        upward moves.
    """
    lineno = 1
    wrange.Select()
    wdFirstCharacterLineNumber = constants("wdFirstCharacterLineNumber")
    wdGoToLine = constants("wdGoToLine")
    wdGoToPrevious = constants("wdGoToPrevious")
    while True:
        curline = doc.Selection.Information(wdFirstCharacterLineNumber)
        doc.Selection.GoTo(wdGoToLine, wdGoToPrevious, Count=1, Name="")
        prevline = doc.Selection.Information(wdFirstCharacterLineNumber)
        if prevline == curline:
            # The selection did not move, so there is no further line
            # above; this attempt must not be counted.
            break
        lineno += 1
    return lineno
[docs]
def get_temp_filename(content):
    """Write content to a new temporary file and return the file's name.

    The file is obtained from fileutil.get_temp_file() in binary write
    mode with the prefix 'lc_' and the suffix '.doc'. The file object is
    closed before returning, also when writing fails.
    """
    options = dict(mode='wb', suffix='.doc', prefix='lc_')
    handle, path = fileutil.get_temp_file(**options)
    try:
        handle.write(content)
    finally:
        handle.close()
    return path