Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 :
21 : #include <stdio.h>
22 : #include <sal/main.h>
23 : #include <osl/file.h>
24 : #include <osl/thread.h>
25 : #include <rtl/alloc.h>
26 : #include <rtl/ustring.hxx>
27 : #include <rtl/strbuf.hxx>
28 :
29 : #include "pdfparse.hxx"
30 :
31 : using namespace pdfparse;
32 :
33 : using ::rtl::OUString;
34 : using ::rtl::OString;
35 : using ::rtl::OStringBuffer;
36 : using ::rtl::OStringToOUString;
37 :
/** Print the command line usage summary to stdout.

    @param pExe
    name of the executable, inserted at the start of every usage line
*/
void printHelp( const char* pExe )
{
    // pExe is substituted once per usage line, hence five times below
    fprintf( stdout,
    "USAGE: %s [-h,--help]\n"
    " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:g1][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " -h, --help: show help\n"
    " -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
    " and prints the mimetype found to stdout\n"
    " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
    " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
    " object numbers, where object number and generation number are separated by \':\'\n"
    " an omitted generation number defaults to 0\n"
    " -pw, --password: use password for decryption\n"
    "\n"
    "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
    , pExe, pExe, pExe, pExe, pExe );
}
58 :
59 : class FileEmitContext : public EmitContext
60 : {
61 : oslFileHandle m_aHandle;
62 : oslFileHandle m_aReadHandle;
63 : unsigned int m_nReadLen;
64 :
65 : void openReadFile( const char* pOrigName );
66 :
67 : public:
68 : FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop );
69 : virtual ~FileEmitContext();
70 :
71 : virtual bool write( const void* pBuf, unsigned int nLen ) throw();
72 : virtual unsigned int getCurPos() throw();
73 : virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw();
74 : virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw();
75 : };
76 :
77 0 : FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop )
78 : : EmitContext( pTop ),
79 : m_aHandle( NULL ),
80 : m_aReadHandle( NULL ),
81 0 : m_nReadLen( 0 )
82 : {
83 0 : OUString aSysFile( OStringToOUString( OString( pFileName ), osl_getThreadTextEncoding() ) );
84 0 : OUString aURL;
85 0 : if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
86 : {
87 0 : fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName );
88 : return;
89 : }
90 :
91 0 : if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None )
92 : {
93 0 : if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None )
94 : {
95 0 : fprintf( stderr, "could not truncate %s\n", pFileName );
96 0 : osl_closeFile( m_aHandle );
97 0 : m_aHandle = NULL;
98 : }
99 : }
100 0 : else if( osl_openFile( aURL.pData, &m_aHandle,
101 0 : osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None )
102 : {
103 0 : fprintf( stderr, "could not open %s\n", pFileName );
104 : return;
105 : }
106 0 : m_bDeflate = true;
107 :
108 0 : openReadFile( pOrigName );
109 : }
110 :
111 0 : FileEmitContext::~FileEmitContext()
112 : {
113 0 : if( m_aHandle )
114 0 : osl_closeFile( m_aHandle );
115 0 : if( m_aReadHandle )
116 0 : osl_closeFile( m_aReadHandle );
117 0 : }
118 :
119 0 : void FileEmitContext::openReadFile( const char* pInFile )
120 : {
121 0 : OUString aSysFile( OStringToOUString( OString( pInFile ), osl_getThreadTextEncoding() ) );
122 0 : OUString aURL;
123 0 : if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
124 : {
125 0 : fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile );
126 : return;
127 : }
128 :
129 0 : if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None )
130 : {
131 0 : fprintf( stderr, "could not open %s\n", pInFile );
132 : return;
133 : }
134 :
135 0 : if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None )
136 : {
137 0 : fprintf( stderr, "could not seek to end of %s\n", pInFile );
138 0 : osl_closeFile( m_aReadHandle );
139 : return;
140 : }
141 :
142 0 : sal_uInt64 nFileSize = 0;
143 0 : if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None )
144 : {
145 0 : fprintf( stderr, "could not get end pos of %s\n", pInFile );
146 0 : osl_closeFile( m_aReadHandle );
147 : return;
148 : }
149 :
150 0 : m_nReadLen = static_cast<unsigned int>(nFileSize);
151 : }
152 :
153 0 : bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) throw()
154 : {
155 0 : if( ! m_aHandle )
156 0 : return false;
157 :
158 0 : sal_uInt64 nWrite = static_cast<sal_uInt64>(nLen);
159 0 : sal_uInt64 nWritten = 0;
160 0 : return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None)
161 0 : && nWrite == nWritten;
162 : }
163 :
164 0 : unsigned int FileEmitContext::getCurPos() throw()
165 : {
166 0 : sal_uInt64 nFileSize = 0;
167 0 : if( m_aHandle )
168 : {
169 0 : if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None )
170 0 : nFileSize = 0;
171 : }
172 0 : return static_cast<unsigned int>(nFileSize);
173 : }
174 :
175 0 : bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw()
176 : {
177 0 : if( nOrigOffset + nLen > m_nReadLen )
178 0 : return false;
179 :
180 0 : if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
181 : {
182 0 : fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
183 0 : return false;
184 : }
185 0 : void* pBuf = rtl_allocateMemory( nLen );
186 0 : if( ! pBuf )
187 0 : return false;
188 0 : sal_uInt64 nBytesRead = 0;
189 0 : if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None
190 : || nBytesRead != static_cast<sal_uInt64>(nLen) )
191 : {
192 0 : fprintf( stderr, "could not read %u bytes\n", nLen );
193 0 : rtl_freeMemory( pBuf );
194 0 : return false;
195 : }
196 0 : bool bRet = write( pBuf, nLen );
197 0 : rtl_freeMemory( pBuf );
198 0 : return bRet;
199 : }
200 :
201 0 : unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw()
202 : {
203 0 : if( nOrigOffset + nLen > m_nReadLen )
204 0 : return 0;
205 :
206 0 : if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
207 : {
208 0 : fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
209 0 : return 0;
210 : }
211 0 : sal_uInt64 nBytesRead = 0;
212 0 : if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None )
213 0 : return 0;
214 0 : return static_cast<unsigned int>(nBytesRead);
215 : }
216 :
217 : typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*);
218 :
219 0 : int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl )
220 : {
221 :
222 0 : PDFReader aParser;
223 0 : int nRet = 0;
224 0 : PDFEntry* pEntry = aParser.read( pInFile );
225 0 : if( pEntry )
226 : {
227 0 : PDFFile* pPDFFile = dynamic_cast<PDFFile*>(pEntry);
228 0 : if( pPDFFile )
229 : {
230 0 : fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" );
231 0 : if( pPassword )
232 : fprintf( stdout, "password %s\n",
233 0 : pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" );
234 0 : nRet = pHdl( pInFile, pOutFile, pPDFFile );
235 : }
236 : else
237 0 : nRet = 20;
238 0 : delete pEntry;
239 : }
240 0 : return nRet;
241 : }
242 :
243 0 : int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
244 : {
245 0 : FileEmitContext aContext( pOutFile, pInFile, pPDFFile );
246 0 : aContext.m_bDecrypt = pPDFFile->isEncrypted();
247 0 : pPDFFile->emit(aContext);
248 0 : return 0;
249 : }
250 :
251 0 : int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile )
252 : {
253 0 : int nRet = 0;
254 0 : unsigned int nArrayElements = pStreams->m_aSubElements.size();
255 0 : for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ )
256 : {
257 0 : PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i]);
258 0 : PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1]);
259 0 : if( ! pMimeType )
260 0 : fprintf( stderr, "error: no mimetype element\n" );
261 0 : if( ! pStreamRef )
262 0 : fprintf( stderr, "error: no stream ref element\n" );
263 0 : if( pMimeType && pStreamRef )
264 : {
265 : fprintf( stdout, "found stream %d %d with mimetype %s\n",
266 : pStreamRef->m_nNumber, pStreamRef->m_nGeneration,
267 0 : pMimeType->m_aName.getStr() );
268 0 : PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
269 0 : if( pObject )
270 : {
271 0 : rtl::OStringBuffer aOutStream( pOutFile );
272 0 : aOutStream.append( "_stream_" );
273 0 : aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
274 0 : aOutStream.append( "_" );
275 0 : aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
276 0 : FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile );
277 0 : aContext.m_bDecrypt = pPDFFile->isEncrypted();
278 0 : pObject->writeStream( aContext, pPDFFile );
279 : }
280 : else
281 : {
282 0 : fprintf( stderr, "object not found\n" );
283 0 : nRet = 121;
284 0 : }
285 : }
286 : else
287 0 : nRet = 120;
288 : }
289 0 : return nRet;
290 : }
291 :
292 0 : int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
293 : {
294 : // find all trailers
295 0 : int nRet = 0;
296 0 : unsigned int nElements = pPDFFile->m_aSubElements.size();
297 0 : for( unsigned i = 0; i < nElements && nRet == 0; i++ )
298 : {
299 0 : PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pPDFFile->m_aSubElements[i]);
300 0 : if( pTrailer && pTrailer->m_pDict )
301 : {
302 : // search for AdditionalStreams entry
303 0 : boost::unordered_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator add_stream;
304 0 : add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
305 0 : if( add_stream != pTrailer->m_pDict->m_aMap.end() )
306 : {
307 0 : PDFArray* pStreams = dynamic_cast<PDFArray*>(add_stream->second);
308 0 : if( pStreams )
309 0 : nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile );
310 : }
311 : }
312 : }
313 0 : return nRet;
314 : }
315 :
316 0 : int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
317 : {
318 0 : int nRet = 0;
319 0 : unsigned int nElements = i_pPDFFile->m_aSubElements.size();
320 0 : for( unsigned i = 0; i < nElements && nRet == 0; i++ )
321 : {
322 : // search FontDescriptors
323 0 : PDFObject* pObj = dynamic_cast<PDFObject*>(i_pPDFFile->m_aSubElements[i]);
324 0 : if( ! pObj )
325 0 : continue;
326 0 : PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
327 0 : if( ! pDict )
328 0 : continue;
329 :
330 : boost::unordered_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator map_it =
331 0 : pDict->m_aMap.find( "Type" );
332 0 : if( map_it == pDict->m_aMap.end() )
333 0 : continue;
334 :
335 0 : PDFName* pName = dynamic_cast<PDFName*>(map_it->second);
336 0 : if( ! pName )
337 0 : continue;
338 0 : if( ! pName->m_aName.equals( "FontDescriptor" ) )
339 0 : continue;
340 :
341 : // the font name will be helpful, also there must be one in
342 : // a font descriptor
343 0 : map_it = pDict->m_aMap.find( "FontName" );
344 0 : if( map_it == pDict->m_aMap.end() )
345 0 : continue;
346 0 : pName = dynamic_cast<PDFName*>(map_it->second);
347 0 : if( ! pName )
348 0 : continue;
349 0 : rtl::OString aFontName( pName->m_aName );
350 :
351 0 : PDFObjectRef* pStreamRef = 0;
352 0 : const char* pFileType = NULL;
353 : // we have a font descriptor, try for a type 1 font
354 0 : map_it = pDict->m_aMap.find( "FontFile" );
355 0 : if( map_it != pDict->m_aMap.end() )
356 : {
357 0 : pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
358 0 : if( pStreamRef )
359 0 : pFileType = "pfa";
360 : }
361 :
362 : // perhaps it's a truetype file ?
363 0 : if( ! pStreamRef )
364 : {
365 0 : map_it = pDict->m_aMap.find( "FontFile2" );
366 0 : if( map_it != pDict->m_aMap.end() )
367 : {
368 0 : pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
369 0 : if( pStreamRef )
370 0 : pFileType = "ttf";
371 : }
372 : }
373 :
374 0 : if( ! pStreamRef )
375 0 : continue;
376 :
377 0 : PDFObject* pStream = i_pPDFFile->findObject( pStreamRef );
378 0 : if( ! pStream )
379 0 : continue;
380 :
381 0 : rtl::OStringBuffer aOutStream( i_pOutFile );
382 0 : aOutStream.append( "_font_" );
383 0 : aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
384 0 : aOutStream.append( "_" );
385 0 : aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
386 0 : aOutStream.append( "_" );
387 0 : aOutStream.append( aFontName );
388 0 : if( pFileType )
389 : {
390 0 : aOutStream.append( "." );
391 0 : aOutStream.append( pFileType );
392 : }
393 0 : FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
394 0 : aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
395 0 : pStream->writeStream( aContext, i_pPDFFile );
396 0 : }
397 0 : return nRet;
398 : }
399 :
400 0 : std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects;
401 :
402 0 : int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
403 : {
404 0 : int nRet = 0;
405 0 : unsigned int nElements = s_aEmitObjects.size();
406 0 : for( unsigned i = 0; i < nElements && nRet == 0; i++ )
407 : {
408 0 : sal_Int32 nObject = s_aEmitObjects[i].first;
409 0 : sal_Int32 nGeneration = s_aEmitObjects[i].second;
410 0 : PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration );
411 0 : if( ! pStream )
412 : {
413 0 : fprintf( stderr, "object %d %d not found !\n", (int)nObject, (int)nGeneration );
414 0 : continue;
415 : }
416 :
417 0 : rtl::OStringBuffer aOutStream( i_pOutFile );
418 0 : aOutStream.append( "_stream_" );
419 0 : aOutStream.append( nObject );
420 0 : aOutStream.append( "_" );
421 0 : aOutStream.append( nGeneration );
422 0 : FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
423 0 : aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
424 0 : pStream->writeStream( aContext, i_pPDFFile );
425 0 : }
426 0 : return nRet;
427 : }
428 :
429 0 : SAL_IMPLEMENT_MAIN_WITH_ARGS( argc, argv )
430 : {
431 0 : const char* pInFile = NULL;
432 0 : const char* pOutFile = NULL;
433 0 : const char* pPassword = NULL;
434 0 : OStringBuffer aOutFile( 256 );
435 0 : PDFFileHdl aHdl = write_unzipFile;
436 :
437 0 : for( int nArg = 1; nArg < argc; nArg++ )
438 : {
439 0 : if( argv[nArg][0] == '-' )
440 : {
441 0 : if( ! rtl_str_compare( "-pw", argv[nArg] ) ||
442 0 : ! rtl_str_compare( "--password" , argv[nArg] ) )
443 : {
444 0 : if( nArg == argc-1 )
445 : {
446 0 : fprintf( stderr, "no password given\n" );
447 0 : return 1;
448 : }
449 0 : nArg++;
450 0 : pPassword = argv[nArg];
451 : }
452 0 : else if( ! rtl_str_compare( "-h", argv[nArg] ) ||
453 0 : ! rtl_str_compare( "--help", argv[nArg] ) )
454 : {
455 0 : printHelp( argv[0] );
456 0 : return 0;
457 : }
458 0 : else if( ! rtl_str_compare( "-a", argv[nArg] ) ||
459 0 : ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) )
460 : {
461 0 : aHdl = write_addStreams;
462 : }
463 0 : else if( ! rtl_str_compare( "-f", argv[nArg] ) ||
464 0 : ! rtl_str_compare( "--extract-fonts", argv[nArg] ) )
465 : {
466 0 : aHdl = write_fonts;
467 : }
468 0 : else if( ! rtl_str_compare( "-o", argv[nArg] ) ||
469 0 : ! rtl_str_compare( "--extract-objects", argv[nArg] ) )
470 : {
471 0 : aHdl = write_objects;
472 0 : nArg++;
473 0 : if( nArg < argc )
474 : {
475 0 : rtl::OString aObjs( argv[nArg] );
476 0 : sal_Int32 nIndex = 0;
477 0 : while( nIndex != -1 )
478 : {
479 0 : rtl::OString aToken( aObjs.getToken( 0, ',', nIndex ) );
480 0 : sal_Int32 nObject = 0;
481 0 : sal_Int32 nGeneration = 0;
482 0 : sal_Int32 nGenIndex = 0;
483 0 : nObject = aToken.getToken( 0, ':', nGenIndex ).toInt32();
484 0 : if( nGenIndex != -1 )
485 0 : nGeneration = aToken.getToken( 0, ':', nGenIndex ).toInt32();
486 0 : s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) );
487 0 : }
488 : }
489 : }
490 : else
491 : {
492 : fprintf( stderr, "unrecognized option \"%s\"\n",
493 0 : argv[nArg] );
494 0 : printHelp( argv[0] );
495 0 : return 1;
496 : }
497 : }
498 0 : else if( pInFile == NULL )
499 0 : pInFile = argv[nArg];
500 0 : else if( pOutFile == NULL )
501 0 : pOutFile = argv[nArg];
502 : }
503 0 : if( ! pInFile )
504 : {
505 0 : fprintf( stderr, "no input file given\n" );
506 0 : return 10;
507 : }
508 0 : if( ! pOutFile )
509 : {
510 0 : OString aFile( pInFile );
511 0 : if( aFile.getLength() > 0 )
512 : {
513 0 : if( aFile.getLength() > 4 )
514 : {
515 0 : if( aFile.matchIgnoreAsciiCase( OString( ".pdf" ), aFile.getLength()-4 ) )
516 0 : aOutFile.append( pInFile, aFile.getLength() - 4 );
517 : else
518 0 : aOutFile.append( aFile );
519 : }
520 0 : aOutFile.append( "_unzip.pdf" );
521 0 : pOutFile = aOutFile.getStr();
522 : }
523 : else
524 : {
525 0 : fprintf( stderr, "no output file given\n" );
526 0 : return 11;
527 0 : }
528 : }
529 :
530 0 : return handleFile( pInFile, pOutFile, pPassword, aHdl );
531 0 : }
532 :
533 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|