1
22
23 package com.liferay.portal.search.lucene;
24
import com.liferay.portal.kernel.io.unsync.UnsyncByteArrayInputStream;
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.FileUtil;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Pattern;

import org.apache.lucene.document.Field;
37
38
43 public class LuceneFileExtractor {
44
45 public Field getFile(String field, InputStream is, String fileExt) {
46 String text = FileUtil.extractText(is, fileExt);
47
48 if (Validator.isNotNull(
49 PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
50
51 text = regexpStrip(text);
52 }
53
54 return LuceneFields.getText(field, text);
55 }
56
57 public Field getFile(String field, byte[] bytes, String fileExt) {
58 InputStream is = new UnsyncByteArrayInputStream(bytes);
59
60 return getFile(field, is, fileExt);
61 }
62
63 public Field getFile(String field, File file, String fileExt)
64 throws IOException {
65
66 InputStream is = new FileInputStream(file);
67
68 return getFile(field, is, fileExt);
69 }
70
71 protected String regexpStrip(String text) {
72 char[] array = text.toCharArray();
73
74 for (int i = 0; i < array.length; i++) {
75 String s = String.valueOf(array[i]);
76
77 if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
78 array[i] = CharPool.SPACE;
79 }
80 }
81
82 return new String(array);
83 }
84
85 }